diff --git a/llvm/include/llvm/CodeGen/RegisterClassInfo.h b/llvm/include/llvm/CodeGen/RegisterClassInfo.h index 800bebea0dddb..417a1e40d02b9 100644 --- a/llvm/include/llvm/CodeGen/RegisterClassInfo.h +++ b/llvm/include/llvm/CodeGen/RegisterClassInfo.h @@ -141,16 +141,11 @@ class RegisterClassInfo { } /// Get the register unit limit for the given pressure set index. - /// - /// RegisterClassInfo adjusts this limit for reserved registers. unsigned getRegPressureSetLimit(unsigned Idx) const { if (!PSetLimits[Idx]) - PSetLimits[Idx] = computePSetLimit(Idx); + PSetLimits[Idx] = TRI->getRegPressureSetLimit(*MF, Idx); return PSetLimits[Idx]; } - -protected: - unsigned computePSetLimit(unsigned Idx) const; }; } // end namespace llvm diff --git a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h index f4bf74c8caa5b..bdb881ee62232 100644 --- a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h +++ b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h @@ -925,9 +925,16 @@ class TargetRegisterInfo : public MCRegisterInfo { virtual const char *getRegPressureSetName(unsigned Idx) const = 0; /// Get the register unit pressure limit for this dimension. - /// This limit must be adjusted dynamically for reserved registers. + /// TargetRegisterInfo adjusts this limit for reserved registers. + /// Avoid using this method directly as it is costly to compute. Use the + /// cached version `RegisterClassInfo::getRegPressureSetLimit` instead. virtual unsigned getRegPressureSetLimit(const MachineFunction &MF, - unsigned Idx) const = 0; + unsigned Idx) const; + + /// Get the raw register unit pressure limit for this dimension. + /// This limit must be adjusted dynamically for reserved registers. + virtual unsigned getRawRegPressureSetLimit(const MachineFunction &MF, + unsigned Idx) const = 0; /// Get the dimensions of register pressure impacted by this register class. /// Returns a -1 terminated array of pressure set IDs. diff --git a/llvm/lib/CodeGen/RegisterClassInfo.cpp b/llvm/lib/CodeGen/RegisterClassInfo.cpp index 9312bc03bc522..976d41a54da56 100644 --- a/llvm/lib/CodeGen/RegisterClassInfo.cpp +++ b/llvm/lib/CodeGen/RegisterClassInfo.cpp @@ -195,40 +195,3 @@ void RegisterClassInfo::compute(const TargetRegisterClass *RC) const { // RCI is now up-to-date. RCI.Tag = Tag; } - -/// This is not accurate because two overlapping register sets may have some -/// nonoverlapping reserved registers. However, computing the allocation order -/// for all register classes would be too expensive. -unsigned RegisterClassInfo::computePSetLimit(unsigned Idx) const { - const TargetRegisterClass *RC = nullptr; - unsigned NumRCUnits = 0; - for (const TargetRegisterClass *C : TRI->regclasses()) { - const int *PSetID = TRI->getRegClassPressureSets(C); - for (; *PSetID != -1; ++PSetID) { - if ((unsigned)*PSetID == Idx) - break; - } - if (*PSetID == -1) - continue; - - // Found a register class that counts against this pressure set. - // For efficiency, only compute the set order for the largest set. - unsigned NUnits = TRI->getRegClassWeight(C).WeightLimit; - if (!RC || NUnits > NumRCUnits) { - RC = C; - NumRCUnits = NUnits; - } - } - assert(RC && "Failed to find register class"); - compute(RC); - unsigned NAllocatableRegs = getNumAllocatableRegs(RC); - unsigned RegPressureSetLimit = TRI->getRegPressureSetLimit(*MF, Idx); - // If all the regs are reserved, return raw RegPressureSetLimit. - // One example is VRSAVERC in PowerPC. - // Avoid returning zero, getRegPressureSetLimit(Idx) assumes computePSetLimit - // return non-zero value. - if (NAllocatableRegs == 0) - return RegPressureSetLimit; - unsigned NReserved = RC->getNumRegs() - NAllocatableRegs; - return RegPressureSetLimit - TRI->getRegClassWeight(RC).RegWeight * NReserved; -} diff --git a/llvm/lib/CodeGen/TargetRegisterInfo.cpp b/llvm/lib/CodeGen/TargetRegisterInfo.cpp index af62623ece6ab..4da1d5920894b 100644 --- a/llvm/lib/CodeGen/TargetRegisterInfo.cpp +++ b/llvm/lib/CodeGen/TargetRegisterInfo.cpp @@ -715,6 +715,49 @@ TargetRegisterInfo::prependOffsetExpression(const DIExpression *Expr, PrependFlags & DIExpression::EntryValue); } +unsigned TargetRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF, + unsigned Idx) const { + const TargetRegisterClass *RC = nullptr; + unsigned NumRCUnits = 0; + for (const TargetRegisterClass *C : regclasses()) { + const int *PSetID = getRegClassPressureSets(C); + for (; *PSetID != -1; ++PSetID) { + if ((unsigned)*PSetID == Idx) + break; + } + if (*PSetID == -1) + continue; + + // Found a register class that counts against this pressure set. + // For efficiency, only compute the set order for the largest set. + unsigned NUnits = getRegClassWeight(C).WeightLimit; + if (!RC || NUnits > NumRCUnits) { + RC = C; + NumRCUnits = NUnits; + } + } + assert(RC && "Failed to find register class"); + + unsigned NAllocatableRegs = 0; + const BitVector &Reserved = MF.getRegInfo().getReservedRegs(); + for (MCPhysReg PhysReg : RC->getRawAllocationOrder(MF)) + if (!Reserved.test(PhysReg)) + NAllocatableRegs++; + + unsigned RegPressureSetLimit = getRawRegPressureSetLimit(MF, Idx); + // If all the regs are reserved, return raw RegPressureSetLimit. + // One example is VRSAVERC in PowerPC. + // Avoid returning zero, RegisterClassInfo::getRegPressureSetLimit(Idx) + // assumes this returns non-zero value. + if (NAllocatableRegs == 0) { + LLVM_DEBUG(dbgs() << "All registers of " << getRegClassName(RC) + << " are reserved!\n";); + return RegPressureSetLimit; + } + unsigned NReserved = RC->getNumRegs() - NAllocatableRegs; + return RegPressureSetLimit - getRegClassWeight(RC).RegWeight * NReserved; +} + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD void TargetRegisterInfo::dumpReg(Register Reg, unsigned SubRegIndex, diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll index bd2bbb9798312..23f24a9dc9982 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll @@ -325,13 +325,13 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 -; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v4, v3 +; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -370,13 +370,13 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 -; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3 +; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -394,13 +394,13 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 -; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -469,21 +469,21 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v5, v[0:1], off +; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX940-NEXT: v_max_f32_e32 v4, v3, v4 +; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB5_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -513,20 +513,20 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off +; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB5_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -536,20 +536,20 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off +; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v5 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB5_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -602,15 +602,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[8:9], v[4:5] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -640,15 +640,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX11-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -686,15 +686,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX908-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX908-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] +; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] ; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -712,15 +712,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: v_mov_b32_e32 v6, v4 -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX8-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX8-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] +; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -758,21 +758,21 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off +; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[2:3], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[8:9] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -795,22 +795,22 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX11-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[6:7], v[0:1], off +; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX11-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] +; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9] +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -840,21 +840,21 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX908-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[6:7], v[0:1], off +; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] -; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc +; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX908-NEXT: v_mov_b32_e32 v7, v5 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v5, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_mov_b32_e32 v4, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB7_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -864,21 +864,21 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX8-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX8-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] -; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v5, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB7_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -918,13 +918,13 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 -; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v4, v3 +; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -963,13 +963,13 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 -; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3 +; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -987,13 +987,13 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 -; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1058,21 +1058,21 @@ define void @flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX940-LABEL: flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v5, v[0:1] +; GFX940-NEXT: flat_load_dword v3, v[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX940-NEXT: v_max_f32_e32 v4, v3, v4 +; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB9_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1104,20 +1104,20 @@ define void @flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX90A-LABEL: flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v5, v[0:1] +; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB9_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1127,20 +1127,20 @@ define void @flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX908-LABEL: flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v4, v[0:1] +; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v5 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB9_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1190,15 +1190,15 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[8:9], v[4:5] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -1228,15 +1228,15 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX11-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1274,15 +1274,15 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX908-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX908-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] +; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] ; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1303,15 +1303,15 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: flat_load_dword v5, v[5:6] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: v_mov_b32_e32 v6, v4 -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX8-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX8-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] +; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -1344,21 +1344,21 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] +; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[2:3], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[8:9] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -1381,22 +1381,22 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b64 v[6:7], v[0:1] +; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] +; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX11-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] +; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9] +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -1427,21 +1427,21 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX908-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] -; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX908-NEXT: v_mov_b32_e32 v7, v5 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v5, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_mov_b32_e32 v4, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB11_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1451,24 +1451,24 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX8-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v6, v[0:1] -; GFX8-NEXT: flat_load_dword v7, v[4:5] +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v5, v[5:6] +; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX8-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] -; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v5, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB11_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1504,21 +1504,20 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, v0 -; GFX940-NEXT: v_mov_b32_e32 v0, s16 -; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX940-NEXT: v_mov_b32_e32 v2, s16 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen ; GFX940-NEXT: s_mov_b64 s[4:5], 0 +; GFX940-NEXT: v_max_f32_e32 v3, v1, v1 ; GFX940-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v0 -; GFX940-NEXT: v_max_f32_e32 v0, v2, v2 -; GFX940-NEXT: v_max_f32_e32 v1, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v1, v0 -; GFX940-NEXT: v_mov_b32_e32 v3, s16 +; GFX940-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX940-NEXT: v_max_f32_e32 v4, v0, v3 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5] ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v2, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -1554,20 +1553,19 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, s20 -; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen +; GFX90A-NEXT: v_mov_b32_e32 v2, s20 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v3, v1, v1 ; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_max_f32_e32 v0, v2, v2 -; GFX90A-NEXT: v_max_f32_e32 v1, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, s20 +; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX90A-NEXT: v_max_f32_e32 v4, v0, v3 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v2, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -1581,24 +1579,23 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: v_mov_b32_e32 v0, s20 -; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen +; GFX908-NEXT: v_mov_b32_e32 v2, s20 +; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v3, v1, v1 ; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: v_max_f32_e32 v0, v2, v2 -; GFX908-NEXT: v_max_f32_e32 v1, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v1, v0 -; GFX908-NEXT: v_mov_b32_e32 v0, v3 -; GFX908-NEXT: v_mov_b32_e32 v5, s20 -; GFX908-NEXT: v_mov_b32_e32 v1, v4 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v5, v0 +; GFX908-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX908-NEXT: v_max_f32_e32 v4, v0, v3 +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v2, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB12_1 @@ -1609,24 +1606,23 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, s20 ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s20 -; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen +; GFX8-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 +; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v1 ; GFX8-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v4 -; GFX8-NEXT: v_max_f32_e32 v3, v0, v2 -; GFX8-NEXT: v_mov_b32_e32 v0, v3 -; GFX8-NEXT: v_mov_b32_e32 v5, s20 -; GFX8-NEXT: v_mov_b32_e32 v1, v4 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v5, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; GFX8-NEXT: v_max_f32_e32 v4, v0, v3 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v2, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB12_1 @@ -1664,24 +1660,23 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_ ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s16 -; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen +; GFX940-NEXT: v_mov_b32_e32 v2, s16 +; GFX940-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen ; GFX940-NEXT: s_mov_b64 s[4:5], 0 +; GFX940-NEXT: v_max_f32_e32 v3, v0, v0 ; GFX940-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v1, v3, v3 -; GFX940-NEXT: v_max_f32_e32 v2, v0, v0 -; GFX940-NEXT: v_max_f32_e32 v2, v1, v2 -; GFX940-NEXT: v_mov_b32_e32 v6, s16 -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX940-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX940-NEXT: v_max_f32_e32 v0, v0, v3 +; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[0:1] ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[0:3], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_cbranch_execnz .LBB13_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1713,23 +1708,22 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_ ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, s20 -; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen +; GFX90A-NEXT: v_mov_b32_e32 v2, s20 +; GFX90A-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v3, v0, v0 ; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v1, v3, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v0, v0 -; GFX90A-NEXT: v_max_f32_e32 v2, v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, s20 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc +; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX90A-NEXT: v_max_f32_e32 v0, v0, v3 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1739,24 +1733,23 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_ ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, s20 -; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen +; GFX908-NEXT: v_mov_b32_e32 v2, s20 +; GFX908-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v3, v0, v0 ; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX908-NEXT: v_max_f32_e32 v3, v0, v0 -; GFX908-NEXT: v_max_f32_e32 v1, v1, v3 -; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_mov_b32_e32 v5, s20 -; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX908-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX908-NEXT: v_max_f32_e32 v0, v0, v3 +; GFX908-NEXT: v_mov_b32_e32 v5, v1 +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v2, v3 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB13_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1766,24 +1759,23 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_ ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s20 -; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX8-NEXT: v_mov_b32_e32 v2, s20 +; GFX8-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v0 +; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v0 ; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX8-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX8-NEXT: v_mov_b32_e32 v4, v1 -; GFX8-NEXT: v_mov_b32_e32 v5, s20 -; GFX8-NEXT: v_mov_b32_e32 v3, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX8-NEXT: v_max_f32_e32 v0, v0, v3 +; GFX8-NEXT: v_mov_b32_e32 v5, v1 +; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v3 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB13_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1810,27 +1802,26 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, v1 -; GFX12-NEXT: v_mov_b32_e32 v0, s16 +; GFX12-NEXT: v_mov_b32_e32 v6, s16 +; GFX12-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, v1 ; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen +; GFX12-NEXT: buffer_load_b64 v[0:1], v6, s[0:3], null offen +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9] -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[0:1] -; GFX12-NEXT: v_mov_b32_e32 v10, s16 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 -; GFX12-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] +; GFX12-NEXT: v_max_num_f64_e32 v[7:8], v[0:1], v[4:5] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 +; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -1854,29 +1845,27 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_ ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, v1 -; GFX11-NEXT: v_mov_b32_e32 v0, s16 +; GFX11-NEXT: v_mov_b32_e32 v6, s16 +; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, v1 ; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen -; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: buffer_load_b64 v[0:1], v6, s[0:3], 0 offen +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 -; GFX11-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[0:1] -; GFX11-NEXT: v_mov_b32_e32 v10, s16 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 -; GFX11-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], 0 offen glc +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX11-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 +; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -1908,28 +1897,27 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_ ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: v_mov_b32_e32 v0, s20 -; GFX908-NEXT: v_mov_b32_e32 v5, v1 -; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen +; GFX908-NEXT: v_mov_b32_e32 v6, s20 +; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v6, s[16:19], 0 offen +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v9, v1 -; GFX908-NEXT: v_mov_b32_e32 v8, v0 -; GFX908-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] -; GFX908-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX908-NEXT: v_mov_b32_e32 v10, s20 -; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[0:1] -; GFX908-NEXT: v_mov_b32_e32 v0, v6 -; GFX908-NEXT: v_mov_b32_e32 v1, v7 -; GFX908-NEXT: v_mov_b32_e32 v2, v8 -; GFX908-NEXT: v_mov_b32_e32 v3, v9 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v10, v1 +; GFX908-NEXT: v_mov_b32_e32 v9, v0 +; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX908-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v7 +; GFX908-NEXT: v_mov_b32_e32 v1, v8 +; GFX908-NEXT: v_mov_b32_e32 v2, v9 +; GFX908-NEXT: v_mov_b32_e32 v3, v10 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB14_1 @@ -1940,28 +1928,27 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_ ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s20 -; GFX8-NEXT: v_mov_b32_e32 v5, v1 -; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen +; GFX8-NEXT: v_mov_b32_e32 v6, s20 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v6, s[16:19], 0 offen +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v1 -; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] -; GFX8-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX8-NEXT: v_mov_b32_e32 v10, s20 -; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, v6 -; GFX8-NEXT: v_mov_b32_e32 v1, v7 -; GFX8-NEXT: v_mov_b32_e32 v2, v8 -; GFX8-NEXT: v_mov_b32_e32 v3, v9 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v10, v1 +; GFX8-NEXT: v_mov_b32_e32 v9, v0 +; GFX8-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX8-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v7 +; GFX8-NEXT: v_mov_b32_e32 v1, v8 +; GFX8-NEXT: v_mov_b32_e32 v2, v9 +; GFX8-NEXT: v_mov_b32_e32 v3, v10 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB14_1 @@ -1989,26 +1976,24 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v2, s16 +; GFX12-NEXT: v_mov_b32_e32 v6, s16 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] ; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], null offen +; GFX12-NEXT: buffer_load_b64 v[2:3], v6, s[0:3], null offen ; GFX12-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[0:1], v[0:1] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7] -; GFX12-NEXT: v_dual_mov_b32 v10, s16 :: v_dual_mov_b32 v9, v5 -; GFX12-NEXT: v_dual_mov_b32 v8, v4 :: v_dual_mov_b32 v7, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX12-NEXT: v_mov_b32_e32 v6, v2 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[6:9], v10, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 +; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v4, v6 :: v_dual_mov_b32 v5, v7 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] +; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -2032,28 +2017,25 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_ ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, s16 +; GFX11-NEXT: v_mov_b32_e32 v6, s16 +; GFX11-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] ; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], 0 offen -; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: buffer_load_b64 v[2:3], v6, s[0:3], 0 offen ; GFX11-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX11-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] +; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] -; GFX11-NEXT: v_dual_mov_b32 v10, s16 :: v_dual_mov_b32 v9, v5 -; GFX11-NEXT: v_dual_mov_b32 v8, v4 :: v_dual_mov_b32 v7, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_mov_b32_e32 v6, v2 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[6:9], v10, s[0:3], 0 offen glc +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 +; GFX11-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v4, v6 :: v_dual_mov_b32 v5, v7 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] +; GFX11-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -2085,27 +2067,26 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_ ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, s20 -; GFX908-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen +; GFX908-NEXT: v_mov_b32_e32 v6, s20 +; GFX908-NEXT: buffer_load_dwordx2 v[2:3], v6, s[16:19], 0 offen +; GFX908-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX908-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] -; GFX908-NEXT: v_mov_b32_e32 v10, s20 -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] -; GFX908-NEXT: v_mov_b32_e32 v9, v5 -; GFX908-NEXT: v_mov_b32_e32 v8, v4 -; GFX908-NEXT: v_mov_b32_e32 v7, v3 -; GFX908-NEXT: v_mov_b32_e32 v6, v2 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[6:9], v10, s[16:19], 0 offen glc +; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v10, v3 +; GFX908-NEXT: v_mov_b32_e32 v9, v2 +; GFX908-NEXT: v_mov_b32_e32 v8, v1 +; GFX908-NEXT: v_mov_b32_e32 v7, v0 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v6 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] +; GFX908-NEXT: v_mov_b32_e32 v2, v7 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v5, v7 +; GFX908-NEXT: v_mov_b32_e32 v3, v8 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB15_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2115,27 +2096,26 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_ ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s20 -; GFX8-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen +; GFX8-NEXT: v_mov_b32_e32 v6, s20 +; GFX8-NEXT: buffer_load_dwordx2 v[2:3], v6, s[16:19], 0 offen +; GFX8-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v10, s20 -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] -; GFX8-NEXT: v_mov_b32_e32 v9, v5 -; GFX8-NEXT: v_mov_b32_e32 v8, v4 -; GFX8-NEXT: v_mov_b32_e32 v7, v3 -; GFX8-NEXT: v_mov_b32_e32 v6, v2 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[6:9], v10, s[16:19], 0 offen glc +; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v10, v3 +; GFX8-NEXT: v_mov_b32_e32 v9, v2 +; GFX8-NEXT: v_mov_b32_e32 v8, v1 +; GFX8-NEXT: v_mov_b32_e32 v7, v0 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v6 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v2, v7 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v5, v7 +; GFX8-NEXT: v_mov_b32_e32 v3, v8 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB15_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll index b646b7f301092..11024b0a88d6b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll @@ -325,13 +325,13 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 -; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v4, v3 +; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -370,13 +370,13 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 -; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v4, v3 +; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -394,13 +394,13 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 -; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -469,21 +469,21 @@ define void @global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v5, v[0:1], off +; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX940-NEXT: v_min_f32_e32 v4, v3, v4 +; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB5_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -513,20 +513,20 @@ define void @global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off +; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB5_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -536,20 +536,20 @@ define void @global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off +; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v5 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB5_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -602,15 +602,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] -; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[8:9], v[4:5] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -640,15 +640,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX11-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -686,15 +686,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX908-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX908-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] +; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] ; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -712,15 +712,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: v_mov_b32_e32 v6, v4 -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX8-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX8-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] +; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -758,21 +758,21 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off +; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[2:3], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[8:9] +; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -795,22 +795,22 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX11-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[6:7], v[0:1], off +; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX11-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] +; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9] +; GFX11-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -840,21 +840,21 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX908-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[6:7], v[0:1], off +; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] -; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc +; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX908-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX908-NEXT: v_mov_b32_e32 v7, v5 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v5, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_mov_b32_e32 v4, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB7_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -864,21 +864,21 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX8-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX8-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] -; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX8-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v5, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB7_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -918,13 +918,13 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 -; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v4, v3 +; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -963,13 +963,13 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 -; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v4, v3 +; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -987,13 +987,13 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 -; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1058,21 +1058,21 @@ define void @flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX940-LABEL: flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v5, v[0:1] +; GFX940-NEXT: flat_load_dword v3, v[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX940-NEXT: v_min_f32_e32 v4, v3, v4 +; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB9_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1104,20 +1104,20 @@ define void @flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v5, v[0:1] +; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB9_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1127,20 +1127,20 @@ define void @flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX908-LABEL: flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v4, v[0:1] +; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v5 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB9_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1190,15 +1190,15 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] -; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[8:9], v[4:5] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -1228,15 +1228,15 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX11-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1274,15 +1274,15 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX908-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX908-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] +; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] ; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1303,15 +1303,15 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: flat_load_dword v5, v[5:6] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: v_mov_b32_e32 v6, v4 -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX8-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX8-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] +; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -1344,21 +1344,21 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] +; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[2:3], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[8:9] +; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -1381,22 +1381,22 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b64 v[6:7], v[0:1] +; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] +; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX11-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] +; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9] +; GFX11-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -1427,21 +1427,21 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX908-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] -; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX908-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX908-NEXT: v_mov_b32_e32 v7, v5 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v5, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_mov_b32_e32 v4, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB11_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1451,24 +1451,24 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX8-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v6, v[0:1] -; GFX8-NEXT: flat_load_dword v7, v[4:5] +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v5, v[5:6] +; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX8-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] -; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX8-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v5, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB11_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1504,21 +1504,20 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_m ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, v0 -; GFX940-NEXT: v_mov_b32_e32 v0, s16 -; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX940-NEXT: v_mov_b32_e32 v2, s16 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen ; GFX940-NEXT: s_mov_b64 s[4:5], 0 +; GFX940-NEXT: v_max_f32_e32 v3, v1, v1 ; GFX940-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v0 -; GFX940-NEXT: v_max_f32_e32 v0, v2, v2 -; GFX940-NEXT: v_max_f32_e32 v1, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v1, v0 -; GFX940-NEXT: v_mov_b32_e32 v3, s16 +; GFX940-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX940-NEXT: v_min_f32_e32 v4, v0, v3 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5] ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v2, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -1554,20 +1553,19 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_m ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, s20 -; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen +; GFX90A-NEXT: v_mov_b32_e32 v2, s20 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v3, v1, v1 ; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_max_f32_e32 v0, v2, v2 -; GFX90A-NEXT: v_max_f32_e32 v1, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, s20 +; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX90A-NEXT: v_min_f32_e32 v4, v0, v3 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v2, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -1581,24 +1579,23 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_m ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: v_mov_b32_e32 v0, s20 -; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen +; GFX908-NEXT: v_mov_b32_e32 v2, s20 +; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v3, v1, v1 ; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: v_max_f32_e32 v0, v2, v2 -; GFX908-NEXT: v_max_f32_e32 v1, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v1, v0 -; GFX908-NEXT: v_mov_b32_e32 v0, v3 -; GFX908-NEXT: v_mov_b32_e32 v5, s20 -; GFX908-NEXT: v_mov_b32_e32 v1, v4 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v5, v0 +; GFX908-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX908-NEXT: v_min_f32_e32 v4, v0, v3 +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v2, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB12_1 @@ -1609,24 +1606,23 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_m ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, s20 ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s20 -; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen +; GFX8-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 +; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v1 ; GFX8-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v4 -; GFX8-NEXT: v_min_f32_e32 v3, v0, v2 -; GFX8-NEXT: v_mov_b32_e32 v0, v3 -; GFX8-NEXT: v_mov_b32_e32 v5, s20 -; GFX8-NEXT: v_mov_b32_e32 v1, v4 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v5, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; GFX8-NEXT: v_min_f32_e32 v4, v0, v3 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v2, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB12_1 @@ -1664,24 +1660,23 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_ ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s16 -; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen +; GFX940-NEXT: v_mov_b32_e32 v2, s16 +; GFX940-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen ; GFX940-NEXT: s_mov_b64 s[4:5], 0 +; GFX940-NEXT: v_max_f32_e32 v3, v0, v0 ; GFX940-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v1, v3, v3 -; GFX940-NEXT: v_max_f32_e32 v2, v0, v0 -; GFX940-NEXT: v_min_f32_e32 v2, v1, v2 -; GFX940-NEXT: v_mov_b32_e32 v6, s16 -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX940-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX940-NEXT: v_min_f32_e32 v0, v0, v3 +; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[0:1] ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[0:3], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_cbranch_execnz .LBB13_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1713,23 +1708,22 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_ ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, s20 -; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen +; GFX90A-NEXT: v_mov_b32_e32 v2, s20 +; GFX90A-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v3, v0, v0 ; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v1, v3, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v0, v0 -; GFX90A-NEXT: v_min_f32_e32 v2, v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, s20 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc +; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX90A-NEXT: v_min_f32_e32 v0, v0, v3 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1739,24 +1733,23 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_ ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, s20 -; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen +; GFX908-NEXT: v_mov_b32_e32 v2, s20 +; GFX908-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v3, v0, v0 ; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX908-NEXT: v_max_f32_e32 v3, v0, v0 -; GFX908-NEXT: v_min_f32_e32 v1, v1, v3 -; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_mov_b32_e32 v5, s20 -; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX908-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX908-NEXT: v_min_f32_e32 v0, v0, v3 +; GFX908-NEXT: v_mov_b32_e32 v5, v1 +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v2, v3 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB13_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1766,24 +1759,23 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_ ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s20 -; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX8-NEXT: v_mov_b32_e32 v2, s20 +; GFX8-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v0 +; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v0 ; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX8-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX8-NEXT: v_mov_b32_e32 v4, v1 -; GFX8-NEXT: v_mov_b32_e32 v5, s20 -; GFX8-NEXT: v_mov_b32_e32 v3, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX8-NEXT: v_min_f32_e32 v0, v0, v3 +; GFX8-NEXT: v_mov_b32_e32 v5, v1 +; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v3 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB13_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1810,27 +1802,26 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, v1 -; GFX12-NEXT: v_mov_b32_e32 v0, s16 +; GFX12-NEXT: v_mov_b32_e32 v6, s16 +; GFX12-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, v1 ; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen +; GFX12-NEXT: buffer_load_b64 v[0:1], v6, s[0:3], null offen +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9] -; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[2:3], v[0:1] -; GFX12-NEXT: v_mov_b32_e32 v10, s16 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 -; GFX12-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] +; GFX12-NEXT: v_min_num_f64_e32 v[7:8], v[0:1], v[4:5] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 +; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -1854,29 +1845,27 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_ ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, v1 -; GFX11-NEXT: v_mov_b32_e32 v0, s16 +; GFX11-NEXT: v_mov_b32_e32 v6, s16 +; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, v1 ; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen -; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: buffer_load_b64 v[0:1], v6, s[0:3], 0 offen +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 -; GFX11-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX11-NEXT: v_min_f64 v[6:7], v[2:3], v[0:1] -; GFX11-NEXT: v_mov_b32_e32 v10, s16 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 -; GFX11-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], 0 offen glc +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX11-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 +; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -1908,28 +1897,27 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_ ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: v_mov_b32_e32 v0, s20 -; GFX908-NEXT: v_mov_b32_e32 v5, v1 -; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen +; GFX908-NEXT: v_mov_b32_e32 v6, s20 +; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v6, s[16:19], 0 offen +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v9, v1 -; GFX908-NEXT: v_mov_b32_e32 v8, v0 -; GFX908-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] -; GFX908-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX908-NEXT: v_mov_b32_e32 v10, s20 -; GFX908-NEXT: v_min_f64 v[6:7], v[2:3], v[0:1] -; GFX908-NEXT: v_mov_b32_e32 v0, v6 -; GFX908-NEXT: v_mov_b32_e32 v1, v7 -; GFX908-NEXT: v_mov_b32_e32 v2, v8 -; GFX908-NEXT: v_mov_b32_e32 v3, v9 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v10, v1 +; GFX908-NEXT: v_mov_b32_e32 v9, v0 +; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX908-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v7 +; GFX908-NEXT: v_mov_b32_e32 v1, v8 +; GFX908-NEXT: v_mov_b32_e32 v2, v9 +; GFX908-NEXT: v_mov_b32_e32 v3, v10 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB14_1 @@ -1940,28 +1928,27 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_ ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s20 -; GFX8-NEXT: v_mov_b32_e32 v5, v1 -; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen +; GFX8-NEXT: v_mov_b32_e32 v6, s20 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v6, s[16:19], 0 offen +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v1 -; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] -; GFX8-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX8-NEXT: v_mov_b32_e32 v10, s20 -; GFX8-NEXT: v_min_f64 v[6:7], v[2:3], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, v6 -; GFX8-NEXT: v_mov_b32_e32 v1, v7 -; GFX8-NEXT: v_mov_b32_e32 v2, v8 -; GFX8-NEXT: v_mov_b32_e32 v3, v9 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v10, v1 +; GFX8-NEXT: v_mov_b32_e32 v9, v0 +; GFX8-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX8-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v7 +; GFX8-NEXT: v_mov_b32_e32 v1, v8 +; GFX8-NEXT: v_mov_b32_e32 v2, v9 +; GFX8-NEXT: v_mov_b32_e32 v3, v10 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB14_1 @@ -1989,26 +1976,24 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v2, s16 +; GFX12-NEXT: v_mov_b32_e32 v6, s16 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] ; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], null offen +; GFX12-NEXT: buffer_load_b64 v[2:3], v6, s[0:3], null offen ; GFX12-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[0:1], v[0:1] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7] -; GFX12-NEXT: v_dual_mov_b32 v10, s16 :: v_dual_mov_b32 v9, v5 -; GFX12-NEXT: v_dual_mov_b32 v8, v4 :: v_dual_mov_b32 v7, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX12-NEXT: v_mov_b32_e32 v6, v2 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[6:9], v10, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 +; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v4, v6 :: v_dual_mov_b32 v5, v7 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] +; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -2032,28 +2017,25 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_ ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, s16 +; GFX11-NEXT: v_mov_b32_e32 v6, s16 +; GFX11-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] ; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], 0 offen -; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: buffer_load_b64 v[2:3], v6, s[0:3], 0 offen ; GFX11-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX11-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] +; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] -; GFX11-NEXT: v_dual_mov_b32 v10, s16 :: v_dual_mov_b32 v9, v5 -; GFX11-NEXT: v_dual_mov_b32 v8, v4 :: v_dual_mov_b32 v7, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_mov_b32_e32 v6, v2 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[6:9], v10, s[0:3], 0 offen glc +; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 +; GFX11-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v4, v6 :: v_dual_mov_b32 v5, v7 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] +; GFX11-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -2085,27 +2067,26 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_ ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, s20 -; GFX908-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen +; GFX908-NEXT: v_mov_b32_e32 v6, s20 +; GFX908-NEXT: buffer_load_dwordx2 v[2:3], v6, s[16:19], 0 offen +; GFX908-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX908-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] -; GFX908-NEXT: v_mov_b32_e32 v10, s20 -; GFX908-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] -; GFX908-NEXT: v_mov_b32_e32 v9, v5 -; GFX908-NEXT: v_mov_b32_e32 v8, v4 -; GFX908-NEXT: v_mov_b32_e32 v7, v3 -; GFX908-NEXT: v_mov_b32_e32 v6, v2 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[6:9], v10, s[16:19], 0 offen glc +; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v10, v3 +; GFX908-NEXT: v_mov_b32_e32 v9, v2 +; GFX908-NEXT: v_mov_b32_e32 v8, v1 +; GFX908-NEXT: v_mov_b32_e32 v7, v0 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v6 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] +; GFX908-NEXT: v_mov_b32_e32 v2, v7 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v5, v7 +; GFX908-NEXT: v_mov_b32_e32 v3, v8 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB15_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2115,27 +2096,26 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_ ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s20 -; GFX8-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen +; GFX8-NEXT: v_mov_b32_e32 v6, s20 +; GFX8-NEXT: buffer_load_dwordx2 v[2:3], v6, s[16:19], 0 offen +; GFX8-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v10, s20 -; GFX8-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] -; GFX8-NEXT: v_mov_b32_e32 v9, v5 -; GFX8-NEXT: v_mov_b32_e32 v8, v4 -; GFX8-NEXT: v_mov_b32_e32 v7, v3 -; GFX8-NEXT: v_mov_b32_e32 v6, v2 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[6:9], v10, s[16:19], 0 offen glc +; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v10, v3 +; GFX8-NEXT: v_mov_b32_e32 v9, v2 +; GFX8-NEXT: v_mov_b32_e32 v8, v1 +; GFX8-NEXT: v_mov_b32_e32 v7, v0 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v6 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v2, v7 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v5, v7 +; GFX8-NEXT: v_mov_b32_e32 v3, v8 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB15_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll index 45a9f9d47acba..823db84a053b8 100644 --- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll @@ -557,11 +557,11 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: s_mul_hi_u32 s9, s0, s7 ; GFX908-NEXT: s_mul_i32 s0, s0, s7 ; GFX908-NEXT: s_add_i32 s1, s9, s1 -; GFX908-NEXT: s_lshl_b64 s[0:1], s[0:1], 5 +; GFX908-NEXT: s_lshl_b64 s[14:15], s[0:1], 5 ; GFX908-NEXT: s_branch .LBB3_2 ; GFX908-NEXT: .LBB3_1: ; %Flow20 ; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX908-NEXT: s_andn2_b64 vcc, exec, s[14:15] +; GFX908-NEXT: s_andn2_b64 vcc, exec, s[0:1] ; GFX908-NEXT: s_cbranch_vccz .LBB3_12 ; GFX908-NEXT: .LBB3_2: ; %bb9 ; GFX908-NEXT: ; =>This Loop Header: Depth=1 @@ -571,15 +571,17 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: ; %bb.3: ; %bb14 ; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1 ; GFX908-NEXT: global_load_dwordx2 v[2:3], v[0:1], off +; GFX908-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], -1 ; GFX908-NEXT: s_mov_b32 s7, s6 +; GFX908-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[0:1] ; GFX908-NEXT: v_mov_b32_e32 v4, s6 +; GFX908-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v6 ; GFX908-NEXT: v_mov_b32_e32 v6, s6 ; GFX908-NEXT: v_mov_b32_e32 v9, s7 ; GFX908-NEXT: v_mov_b32_e32 v5, s7 ; GFX908-NEXT: v_mov_b32_e32 v7, s7 ; GFX908-NEXT: v_mov_b32_e32 v8, s6 -; GFX908-NEXT: v_cmp_lt_i64_e64 s[14:15], s[4:5], 0 -; GFX908-NEXT: v_cmp_gt_i64_e64 s[16:17], s[4:5], -1 +; GFX908-NEXT: v_cmp_lt_i64_e64 s[16:17], s[4:5], 0 ; GFX908-NEXT: v_mov_b32_e32 v11, v5 ; GFX908-NEXT: s_mov_b64 s[18:19], s[10:11] ; GFX908-NEXT: v_mov_b32_e32 v10, v4 @@ -599,9 +601,9 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: ; in Loop: Header=BB3_5 Depth=2 ; GFX908-NEXT: v_add_co_u32_sdwa v2, vcc, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX908-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX908-NEXT: s_add_u32 s18, s18, s0 +; GFX908-NEXT: s_add_u32 s18, s18, s14 ; GFX908-NEXT: v_cmp_lt_i64_e64 s[22:23], -1, v[2:3] -; GFX908-NEXT: s_addc_u32 s19, s19, s1 +; GFX908-NEXT: s_addc_u32 s19, s19, s15 ; GFX908-NEXT: s_mov_b64 s[20:21], 0 ; GFX908-NEXT: s_andn2_b64 vcc, exec, s[22:23] ; GFX908-NEXT: s_cbranch_vccz .LBB3_9 @@ -620,7 +622,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: ds_read_b64 v[12:13], v19 ; GFX908-NEXT: ds_read_b64 v[14:15], v0 -; GFX908-NEXT: s_andn2_b64 vcc, exec, s[16:17] +; GFX908-NEXT: s_and_b64 vcc, exec, s[0:1] ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: s_cbranch_vccnz .LBB3_7 ; GFX908-NEXT: ; %bb.6: ; %bb51 @@ -648,7 +650,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: s_mov_b64 s[20:21], -1 ; GFX908-NEXT: s_branch .LBB3_4 ; GFX908-NEXT: .LBB3_7: ; in Loop: Header=BB3_5 Depth=2 -; GFX908-NEXT: s_mov_b64 s[20:21], s[14:15] +; GFX908-NEXT: s_mov_b64 s[20:21], s[16:17] ; GFX908-NEXT: s_andn2_b64 vcc, exec, s[20:21] ; GFX908-NEXT: s_cbranch_vccz .LBB3_4 ; GFX908-NEXT: ; %bb.8: ; in Loop: Header=BB3_2 Depth=1 @@ -659,7 +661,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: s_xor_b64 s[16:17], s[20:21], -1 ; GFX908-NEXT: .LBB3_10: ; %Flow19 ; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX908-NEXT: s_mov_b64 s[14:15], -1 +; GFX908-NEXT: s_mov_b64 s[0:1], -1 ; GFX908-NEXT: s_and_b64 vcc, exec, s[16:17] ; GFX908-NEXT: s_cbranch_vccz .LBB3_1 ; GFX908-NEXT: ; %bb.11: ; %bb12 @@ -668,7 +670,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: s_addc_u32 s5, s5, 0 ; GFX908-NEXT: s_add_u32 s10, s10, s12 ; GFX908-NEXT: s_addc_u32 s11, s11, s13 -; GFX908-NEXT: s_mov_b64 s[14:15], 0 +; GFX908-NEXT: s_mov_b64 s[0:1], 0 ; GFX908-NEXT: s_branch .LBB3_1 ; GFX908-NEXT: .LBB3_12: ; %DummyReturnBlock ; GFX908-NEXT: s_endpgm @@ -718,11 +720,11 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: s_mul_hi_u32 s9, s0, s7 ; GFX90A-NEXT: s_mul_i32 s0, s0, s7 ; GFX90A-NEXT: s_add_i32 s1, s9, s1 -; GFX90A-NEXT: s_lshl_b64 s[0:1], s[0:1], 5 +; GFX90A-NEXT: s_lshl_b64 s[14:15], s[0:1], 5 ; GFX90A-NEXT: s_branch .LBB3_2 ; GFX90A-NEXT: .LBB3_1: ; %Flow20 ; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[14:15] +; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[0:1] ; GFX90A-NEXT: s_cbranch_vccz .LBB3_12 ; GFX90A-NEXT: .LBB3_2: ; %bb9 ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 @@ -732,12 +734,14 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: ; %bb.3: ; %bb14 ; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1 ; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX90A-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], -1 ; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[0:1] ; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v8 ; GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-NEXT: v_cmp_lt_i64_e64 s[14:15], s[4:5], 0 -; GFX90A-NEXT: v_cmp_gt_i64_e64 s[16:17], s[4:5], -1 +; GFX90A-NEXT: v_cmp_lt_i64_e64 s[16:17], s[4:5], 0 ; GFX90A-NEXT: s_mov_b64 s[18:19], s[10:11] ; GFX90A-NEXT: v_pk_mov_b32 v[12:13], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -756,8 +760,8 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: ; in Loop: Header=BB3_5 Depth=2 ; GFX90A-NEXT: v_add_co_u32_sdwa v4, vcc, v4, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc -; GFX90A-NEXT: s_add_u32 s18, s18, s0 -; GFX90A-NEXT: s_addc_u32 s19, s19, s1 +; GFX90A-NEXT: s_add_u32 s18, s18, s14 +; GFX90A-NEXT: s_addc_u32 s19, s19, s15 ; GFX90A-NEXT: v_cmp_lt_i64_e64 s[22:23], -1, v[4:5] ; GFX90A-NEXT: s_mov_b64 s[20:21], 0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[22:23] @@ -777,7 +781,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: ds_read_b64 v[14:15], v19 ; GFX90A-NEXT: ds_read_b64 v[16:17], v0 -; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[16:17] +; GFX90A-NEXT: s_and_b64 vcc, exec, s[0:1] ; GFX90A-NEXT: ; kill: killed $sgpr20 killed $sgpr21 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_cbranch_vccnz .LBB3_7 @@ -798,7 +802,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: s_mov_b64 s[20:21], -1 ; GFX90A-NEXT: s_branch .LBB3_4 ; GFX90A-NEXT: .LBB3_7: ; in Loop: Header=BB3_5 Depth=2 -; GFX90A-NEXT: s_mov_b64 s[20:21], s[14:15] +; GFX90A-NEXT: s_mov_b64 s[20:21], s[16:17] ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[20:21] ; GFX90A-NEXT: s_cbranch_vccz .LBB3_4 ; GFX90A-NEXT: ; %bb.8: ; in Loop: Header=BB3_2 Depth=1 @@ -809,7 +813,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: s_xor_b64 s[16:17], s[20:21], -1 ; GFX90A-NEXT: .LBB3_10: ; %Flow19 ; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX90A-NEXT: s_mov_b64 s[14:15], -1 +; GFX90A-NEXT: s_mov_b64 s[0:1], -1 ; GFX90A-NEXT: s_and_b64 vcc, exec, s[16:17] ; GFX90A-NEXT: s_cbranch_vccz .LBB3_1 ; GFX90A-NEXT: ; %bb.11: ; %bb12 @@ -818,7 +822,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: s_addc_u32 s5, s5, 0 ; GFX90A-NEXT: s_add_u32 s10, s10, s12 ; GFX90A-NEXT: s_addc_u32 s11, s11, s13 -; GFX90A-NEXT: s_mov_b64 s[14:15], 0 +; GFX90A-NEXT: s_mov_b64 s[0:1], 0 ; GFX90A-NEXT: s_branch .LBB3_1 ; GFX90A-NEXT: .LBB3_12: ; %DummyReturnBlock ; GFX90A-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll index 48c7a9f5e2d14..e8f1619c5d418 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll @@ -54,23 +54,23 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 -; GFX10-NEXT: s_add_i32 s5, s20, 0x400 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 +; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: v_mov_b32_e32 v5, s5 +; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 -; GFX10-NEXT: v_mov_b32_e32 v1, v4 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX10-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB0_1 @@ -95,18 +95,18 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX908-NEXT: v_mov_b32_e32 v0, v3 -; GFX908-NEXT: v_mov_b32_e32 v5, s6 -; GFX908-NEXT: v_mov_b32_e32 v1, v4 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v5, v0 +; GFX908-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB0_1 @@ -122,18 +122,18 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX8-NEXT: v_mov_b32_e32 v0, v3 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, v4 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v5, v0 +; GFX8-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB0_1 @@ -149,18 +149,18 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX7-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s6 ; GFX7-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v4, v0 -; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v0, v3 -; GFX7-NEXT: v_mov_b32_e32 v5, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v5, v0 +; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX7-NEXT: v_mov_b32_e32 v0, v4 +; GFX7-NEXT: v_mov_b32_e32 v1, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB0_1 @@ -176,19 +176,19 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX6-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: v_mov_b32_e32 v3, s6 ; GFX6-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v4, v0 -; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX6-NEXT: v_mov_b32_e32 v5, v0 +; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, v3 -; GFX6-NEXT: v_mov_b32_e32 v5, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v0, v4 +; GFX6-NEXT: v_mov_b32_e32 v1, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB0_1 @@ -241,23 +241,23 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s20 -; GFX10-NEXT: s_add_i32 s5, s20, 0x400 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 +; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_f32_e32 v1, v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v5, s5 +; GFX10-NEXT: v_mov_b32_e32 v5, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX10-NEXT: v_mov_b32_e32 v4, v1 +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB1_1 @@ -290,19 +290,19 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_add_f32_e32 v1, v2, v0 -; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v5, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v1 +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v2, v3 +; GFX8-NEXT: v_mov_b32_e32 v2, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB1_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -316,19 +316,19 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX7-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s6 ; GFX7-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_add_f32_e32 v1, v2, v0 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v5, s6 -; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v5, v2 +; GFX7-NEXT: v_mov_b32_e32 v4, v1 +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v2, v3 +; GFX7-NEXT: v_mov_b32_e32 v2, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB1_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -342,20 +342,20 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX6-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: v_mov_b32_e32 v3, s6 ; GFX6-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_add_f32_e32 v1, v2, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: v_mov_b32_e32 v5, s6 -; GFX6-NEXT: v_mov_b32_e32 v3, v1 -; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v5, v2 +; GFX6-NEXT: v_mov_b32_e32 v4, v1 +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v2, v3 +; GFX6-NEXT: v_mov_b32_e32 v2, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB1_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -827,23 +827,23 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 -; GFX10-NEXT: s_add_i32 s5, s20, 0x400 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 +; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: v_mov_b32_e32 v5, s5 +; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 -; GFX10-NEXT: v_mov_b32_e32 v1, v4 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX10-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB3_1 @@ -859,12 +859,12 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -885,18 +885,18 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX908-NEXT: v_mov_b32_e32 v0, v3 -; GFX908-NEXT: v_mov_b32_e32 v5, s6 -; GFX908-NEXT: v_mov_b32_e32 v1, v4 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v5, v0 +; GFX908-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB3_1 @@ -912,18 +912,18 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX8-NEXT: v_mov_b32_e32 v0, v3 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, v4 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v5, v0 +; GFX8-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB3_1 @@ -939,18 +939,18 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX7-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s6 ; GFX7-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v4, v0 -; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v0, v3 -; GFX7-NEXT: v_mov_b32_e32 v5, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v5, v0 +; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX7-NEXT: v_mov_b32_e32 v0, v4 +; GFX7-NEXT: v_mov_b32_e32 v1, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB3_1 @@ -966,19 +966,19 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX6-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: v_mov_b32_e32 v3, s6 ; GFX6-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v4, v0 -; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX6-NEXT: v_mov_b32_e32 v5, v0 +; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, v3 -; GFX6-NEXT: v_mov_b32_e32 v5, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v0, v4 +; GFX6-NEXT: v_mov_b32_e32 v1, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB3_1 @@ -1031,23 +1031,23 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s20 -; GFX10-NEXT: s_add_i32 s5, s20, 0x400 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 +; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_f32_e32 v1, v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v5, s5 +; GFX10-NEXT: v_mov_b32_e32 v5, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX10-NEXT: v_mov_b32_e32 v4, v1 +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB4_1 @@ -1062,11 +1062,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_add_f32_e32 v2, v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -1087,19 +1087,19 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_add_f32_e32 v1, v2, v0 -; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_mov_b32_e32 v5, s6 -; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v5, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v1 +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v2, v3 +; GFX908-NEXT: v_mov_b32_e32 v2, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB4_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1113,19 +1113,19 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_add_f32_e32 v1, v2, v0 -; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v5, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v1 +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v2, v3 +; GFX8-NEXT: v_mov_b32_e32 v2, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB4_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1139,19 +1139,19 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX7-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s6 ; GFX7-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_add_f32_e32 v1, v2, v0 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v5, s6 -; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v5, v2 +; GFX7-NEXT: v_mov_b32_e32 v4, v1 +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v2, v3 +; GFX7-NEXT: v_mov_b32_e32 v2, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB4_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1165,20 +1165,20 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX6-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: v_mov_b32_e32 v3, s6 ; GFX6-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_add_f32_e32 v1, v2, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: v_mov_b32_e32 v5, s6 -; GFX6-NEXT: v_mov_b32_e32 v3, v1 -; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v5, v2 +; GFX6-NEXT: v_mov_b32_e32 v4, v1 +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v2, v3 +; GFX6-NEXT: v_mov_b32_e32 v2, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB4_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1218,24 +1218,25 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, v0 +; GFX11-NEXT: s_add_i32 s4, s16, 0x400 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s4 ; GFX11-NEXT: v_mov_b32_e32 v0, s16 -; GFX11-NEXT: s_add_i32 s5, s16, 0x400 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX11-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], 0 offen glc +; GFX11-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -1249,23 +1250,23 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 -; GFX10-NEXT: s_add_i32 s5, s20, 0x400 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 +; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: v_mov_b32_e32 v5, s5 +; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 -; GFX10-NEXT: v_mov_b32_e32 v1, v4 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX10-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB5_1 @@ -1281,12 +1282,12 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -1307,18 +1308,18 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX908-NEXT: v_mov_b32_e32 v0, v3 -; GFX908-NEXT: v_mov_b32_e32 v5, s6 -; GFX908-NEXT: v_mov_b32_e32 v1, v4 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v5, v0 +; GFX908-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB5_1 @@ -1334,18 +1335,18 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX8-NEXT: v_mov_b32_e32 v0, v3 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, v4 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v5, v0 +; GFX8-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB5_1 @@ -1361,18 +1362,18 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX7-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s6 ; GFX7-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v4, v0 -; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v0, v3 -; GFX7-NEXT: v_mov_b32_e32 v5, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v5, v0 +; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX7-NEXT: v_mov_b32_e32 v0, v4 +; GFX7-NEXT: v_mov_b32_e32 v1, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB5_1 @@ -1388,19 +1389,19 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX6-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: v_mov_b32_e32 v3, s6 ; GFX6-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v4, v0 -; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX6-NEXT: v_mov_b32_e32 v5, v0 +; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, v3 -; GFX6-NEXT: v_mov_b32_e32 v5, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v0, v4 +; GFX6-NEXT: v_mov_b32_e32 v1, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB5_1 @@ -1441,24 +1442,25 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, v0 +; GFX11-NEXT: s_add_i32 s4, s16, 0x400 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s4 ; GFX11-NEXT: v_mov_b32_e32 v0, s16 -; GFX11-NEXT: s_add_i32 s5, s16, 0x400 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX11-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], 0 offen glc +; GFX11-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -1472,23 +1474,23 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 -; GFX10-NEXT: s_add_i32 s5, s20, 0x400 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 +; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: v_mov_b32_e32 v5, s5 +; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 -; GFX10-NEXT: v_mov_b32_e32 v1, v4 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX10-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB6_1 @@ -1504,12 +1506,12 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -1530,18 +1532,18 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX908-NEXT: v_mov_b32_e32 v0, v3 -; GFX908-NEXT: v_mov_b32_e32 v5, s6 -; GFX908-NEXT: v_mov_b32_e32 v1, v4 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v5, v0 +; GFX908-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB6_1 @@ -1557,18 +1559,18 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX8-NEXT: v_mov_b32_e32 v0, v3 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, v4 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v5, v0 +; GFX8-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB6_1 @@ -1584,18 +1586,18 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX7-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s6 ; GFX7-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v4, v0 -; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v0, v3 -; GFX7-NEXT: v_mov_b32_e32 v5, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v5, v0 +; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX7-NEXT: v_mov_b32_e32 v0, v4 +; GFX7-NEXT: v_mov_b32_e32 v1, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB6_1 @@ -1611,19 +1613,19 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX6-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: v_mov_b32_e32 v3, s6 ; GFX6-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v4, v0 -; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX6-NEXT: v_mov_b32_e32 v5, v0 +; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, v3 -; GFX6-NEXT: v_mov_b32_e32 v5, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v0, v4 +; GFX6-NEXT: v_mov_b32_e32 v1, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB6_1 @@ -1664,24 +1666,25 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, v0 +; GFX11-NEXT: s_add_i32 s4, s16, 0x400 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s4 ; GFX11-NEXT: v_mov_b32_e32 v0, s16 -; GFX11-NEXT: s_add_i32 s5, s16, 0x400 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX11-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], 0 offen glc +; GFX11-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -1695,23 +1698,23 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 -; GFX10-NEXT: s_add_i32 s5, s20, 0x400 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 +; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: v_mov_b32_e32 v5, s5 +; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 -; GFX10-NEXT: v_mov_b32_e32 v1, v4 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX10-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB7_1 @@ -1727,12 +1730,12 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -1753,18 +1756,18 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX908-NEXT: v_mov_b32_e32 v0, v3 -; GFX908-NEXT: v_mov_b32_e32 v5, s6 -; GFX908-NEXT: v_mov_b32_e32 v1, v4 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v5, v0 +; GFX908-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB7_1 @@ -1780,18 +1783,18 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX8-NEXT: v_mov_b32_e32 v0, v3 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, v4 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v5, v0 +; GFX8-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB7_1 @@ -1807,18 +1810,18 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX7-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s6 ; GFX7-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v4, v0 -; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v0, v3 -; GFX7-NEXT: v_mov_b32_e32 v5, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v5, v0 +; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX7-NEXT: v_mov_b32_e32 v0, v4 +; GFX7-NEXT: v_mov_b32_e32 v1, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB7_1 @@ -1834,19 +1837,19 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX6-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: v_mov_b32_e32 v3, s6 ; GFX6-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v4, v0 -; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX6-NEXT: v_mov_b32_e32 v5, v0 +; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, v3 -; GFX6-NEXT: v_mov_b32_e32 v5, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v0, v4 +; GFX6-NEXT: v_mov_b32_e32 v1, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB7_1 @@ -1873,25 +1876,25 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX12-NEXT: v_mov_b32_e32 v0, s16 -; GFX12-NEXT: s_add_co_i32 s5, s16, 0x800 +; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v6, s4 ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 ; GFX12-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 +; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_add_f64_e32 v[6:7], v[8:9], v[4:5] -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v10, s5 -; GFX12-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX12-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add_f64_e32 v[7:8], v[9:10], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 +; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -1916,25 +1919,25 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, s16 -; GFX11-NEXT: s_add_i32 s5, s16, 0x800 +; GFX11-NEXT: s_add_i32 s4, s16, 0x800 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v6, s4 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 ; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 +; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] -; GFX11-NEXT: v_mov_b32_e32 v10, s5 -; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], 0 offen glc +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 +; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -1949,26 +1952,26 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 ; GFX10-NEXT: v_mov_b32_e32 v5, v1 -; GFX10-NEXT: s_add_i32 s5, s20, 0x800 -; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: s_add_i32 s4, s20, 0x800 +; GFX10-NEXT: v_mov_b32_e32 v6, s4 ; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v9, v1 -; GFX10-NEXT: v_mov_b32_e32 v8, v0 -; GFX10-NEXT: v_mov_b32_e32 v10, s5 +; GFX10-NEXT: v_mov_b32_e32 v10, v1 +; GFX10-NEXT: v_mov_b32_e32 v9, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] -; GFX10-NEXT: v_mov_b32_e32 v0, v6 -; GFX10-NEXT: v_mov_b32_e32 v1, v7 -; GFX10-NEXT: v_mov_b32_e32 v2, v8 -; GFX10-NEXT: v_mov_b32_e32 v3, v9 -; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc +; GFX10-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] +; GFX10-NEXT: v_mov_b32_e32 v0, v7 +; GFX10-NEXT: v_mov_b32_e32 v1, v8 +; GFX10-NEXT: v_mov_b32_e32 v2, v9 +; GFX10-NEXT: v_mov_b32_e32 v3, v10 +; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB8_1 @@ -1994,21 +1997,21 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX908-NEXT: s_add_i32 s6, s20, 0x800 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_mov_b32_e32 v6, s6 ; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v9, v1 -; GFX908-NEXT: v_mov_b32_e32 v8, v0 -; GFX908-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v10, s6 -; GFX908-NEXT: v_mov_b32_e32 v0, v6 -; GFX908-NEXT: v_mov_b32_e32 v1, v7 -; GFX908-NEXT: v_mov_b32_e32 v2, v8 -; GFX908-NEXT: v_mov_b32_e32 v3, v9 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v10, v1 +; GFX908-NEXT: v_mov_b32_e32 v9, v0 +; GFX908-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v7 +; GFX908-NEXT: v_mov_b32_e32 v1, v8 +; GFX908-NEXT: v_mov_b32_e32 v2, v9 +; GFX908-NEXT: v_mov_b32_e32 v3, v10 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB8_1 @@ -2025,21 +2028,21 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX8-NEXT: s_add_i32 s6, s20, 0x800 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 ; GFX8-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v1 -; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v10, s6 -; GFX8-NEXT: v_mov_b32_e32 v0, v6 -; GFX8-NEXT: v_mov_b32_e32 v1, v7 -; GFX8-NEXT: v_mov_b32_e32 v2, v8 -; GFX8-NEXT: v_mov_b32_e32 v3, v9 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v10, v1 +; GFX8-NEXT: v_mov_b32_e32 v9, v0 +; GFX8-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v7 +; GFX8-NEXT: v_mov_b32_e32 v1, v8 +; GFX8-NEXT: v_mov_b32_e32 v2, v9 +; GFX8-NEXT: v_mov_b32_e32 v3, v10 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB8_1 @@ -2056,21 +2059,21 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX7-NEXT: s_add_i32 s6, s20, 0x800 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s6 ; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v9, v1 -; GFX7-NEXT: v_mov_b32_e32 v8, v0 -; GFX7-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] -; GFX7-NEXT: v_mov_b32_e32 v10, s6 -; GFX7-NEXT: v_mov_b32_e32 v0, v6 -; GFX7-NEXT: v_mov_b32_e32 v1, v7 -; GFX7-NEXT: v_mov_b32_e32 v2, v8 -; GFX7-NEXT: v_mov_b32_e32 v3, v9 -; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v10, v1 +; GFX7-NEXT: v_mov_b32_e32 v9, v0 +; GFX7-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] +; GFX7-NEXT: v_mov_b32_e32 v0, v7 +; GFX7-NEXT: v_mov_b32_e32 v1, v8 +; GFX7-NEXT: v_mov_b32_e32 v2, v9 +; GFX7-NEXT: v_mov_b32_e32 v3, v10 +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB8_1 @@ -2087,22 +2090,22 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX6-NEXT: s_add_i32 s6, s20, 0x800 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: v_mov_b32_e32 v6, s6 ; GFX6-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v9, v1 -; GFX6-NEXT: v_mov_b32_e32 v8, v0 -; GFX6-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] -; GFX6-NEXT: v_mov_b32_e32 v10, s6 +; GFX6-NEXT: v_mov_b32_e32 v10, v1 +; GFX6-NEXT: v_mov_b32_e32 v9, v0 +; GFX6-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, v6 -; GFX6-NEXT: v_mov_b32_e32 v1, v7 -; GFX6-NEXT: v_mov_b32_e32 v2, v8 -; GFX6-NEXT: v_mov_b32_e32 v3, v9 -; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v0, v7 +; GFX6-NEXT: v_mov_b32_e32 v1, v8 +; GFX6-NEXT: v_mov_b32_e32 v2, v9 +; GFX6-NEXT: v_mov_b32_e32 v3, v10 +; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB8_1 @@ -2124,24 +2127,25 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, s16 -; GFX12-NEXT: s_add_co_i32 s5, s16, 0x800 +; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v6, s4 ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], null offen offset:2048 ; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_add_f64_e32 v[2:3], v[4:5], v[0:1] -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_dual_mov_b32 v10, s5 :: v_dual_mov_b32 v9, v5 -; GFX12-NEXT: v_mov_b32_e32 v8, v4 +; GFX12-NEXT: v_dual_mov_b32 v10, v5 :: v_dual_mov_b32 v9, v4 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX12-NEXT: v_dual_mov_b32 v7, v3 :: v_dual_mov_b32 v6, v2 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[6:9], v10, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v4, v6 :: v_dual_mov_b32 v5, v7 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v4, v7 :: v_dual_mov_b32 v5, v8 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -2165,24 +2169,25 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v2, s16 -; GFX11-NEXT: s_add_i32 s5, s16, 0x800 +; GFX11-NEXT: s_add_i32 s4, s16, 0x800 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v6, s4 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], 0 offen offset:2048 ; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX11-NEXT: v_dual_mov_b32 v10, s5 :: v_dual_mov_b32 v9, v5 -; GFX11-NEXT: v_mov_b32_e32 v8, v4 +; GFX11-NEXT: v_dual_mov_b32 v10, v5 :: v_dual_mov_b32 v9, v4 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_dual_mov_b32 v7, v3 :: v_dual_mov_b32 v6, v2 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[6:9], v10, s[0:3], 0 offen glc +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v4, v6 :: v_dual_mov_b32 v5, v7 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v4, v7 :: v_dual_mov_b32 v5, v8 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -2195,26 +2200,26 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, s20 -; GFX10-NEXT: s_add_i32 s5, s20, 0x800 +; GFX10-NEXT: s_add_i32 s4, s20, 0x800 +; GFX10-NEXT: v_mov_b32_e32 v6, s4 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 ; GFX10-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX10-NEXT: v_mov_b32_e32 v9, v5 -; GFX10-NEXT: v_mov_b32_e32 v10, s5 -; GFX10-NEXT: v_mov_b32_e32 v8, v4 +; GFX10-NEXT: v_mov_b32_e32 v10, v5 +; GFX10-NEXT: v_mov_b32_e32 v9, v4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v7, v3 -; GFX10-NEXT: v_mov_b32_e32 v6, v2 -; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[6:9], v10, s[16:19], 0 offen glc +; GFX10-NEXT: v_mov_b32_e32 v8, v3 +; GFX10-NEXT: v_mov_b32_e32 v7, v2 +; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[4:5] -; GFX10-NEXT: v_mov_b32_e32 v4, v6 -; GFX10-NEXT: v_mov_b32_e32 v5, v7 +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[4:5] +; GFX10-NEXT: v_mov_b32_e32 v4, v7 +; GFX10-NEXT: v_mov_b32_e32 v5, v8 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB9_1 @@ -2238,22 +2243,22 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_ ; GFX908-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 ; GFX908-NEXT: s_add_i32 s6, s20, 0x800 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_mov_b32_e32 v6, s6 ; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX908-NEXT: v_mov_b32_e32 v9, v5 -; GFX908-NEXT: v_mov_b32_e32 v10, s6 -; GFX908-NEXT: v_mov_b32_e32 v8, v4 -; GFX908-NEXT: v_mov_b32_e32 v7, v3 -; GFX908-NEXT: v_mov_b32_e32 v6, v2 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[6:9], v10, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v10, v5 +; GFX908-NEXT: v_mov_b32_e32 v9, v4 +; GFX908-NEXT: v_mov_b32_e32 v8, v3 +; GFX908-NEXT: v_mov_b32_e32 v7, v2 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v6 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v4, v7 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v5, v7 +; GFX908-NEXT: v_mov_b32_e32 v5, v8 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB9_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2267,22 +2272,22 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_ ; GFX8-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 ; GFX8-NEXT: s_add_i32 s6, s20, 0x800 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 ; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v9, v5 -; GFX8-NEXT: v_mov_b32_e32 v10, s6 -; GFX8-NEXT: v_mov_b32_e32 v8, v4 -; GFX8-NEXT: v_mov_b32_e32 v7, v3 -; GFX8-NEXT: v_mov_b32_e32 v6, v2 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[6:9], v10, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v10, v5 +; GFX8-NEXT: v_mov_b32_e32 v9, v4 +; GFX8-NEXT: v_mov_b32_e32 v8, v3 +; GFX8-NEXT: v_mov_b32_e32 v7, v2 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v6 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v7 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v5, v7 +; GFX8-NEXT: v_mov_b32_e32 v5, v8 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB9_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2296,22 +2301,22 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_ ; GFX7-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 ; GFX7-NEXT: s_add_i32 s6, s20, 0x800 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s6 ; GFX7-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v9, v5 -; GFX7-NEXT: v_mov_b32_e32 v10, s6 -; GFX7-NEXT: v_mov_b32_e32 v8, v4 -; GFX7-NEXT: v_mov_b32_e32 v7, v3 -; GFX7-NEXT: v_mov_b32_e32 v6, v2 -; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[6:9], v10, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v10, v5 +; GFX7-NEXT: v_mov_b32_e32 v9, v4 +; GFX7-NEXT: v_mov_b32_e32 v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v7, v2 +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] -; GFX7-NEXT: v_mov_b32_e32 v4, v6 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[4:5] +; GFX7-NEXT: v_mov_b32_e32 v4, v7 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v5, v7 +; GFX7-NEXT: v_mov_b32_e32 v5, v8 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB9_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2325,23 +2330,23 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_ ; GFX6-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 ; GFX6-NEXT: s_add_i32 s6, s20, 0x800 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: v_mov_b32_e32 v6, s6 ; GFX6-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v9, v5 -; GFX6-NEXT: v_mov_b32_e32 v10, s6 -; GFX6-NEXT: v_mov_b32_e32 v8, v4 -; GFX6-NEXT: v_mov_b32_e32 v7, v3 -; GFX6-NEXT: v_mov_b32_e32 v6, v2 -; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[6:9], v10, s[16:19], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v10, v5 +; GFX6-NEXT: v_mov_b32_e32 v9, v4 +; GFX6-NEXT: v_mov_b32_e32 v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v7, v2 +; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] -; GFX6-NEXT: v_mov_b32_e32 v4, v6 +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[4:5] +; GFX6-NEXT: v_mov_b32_e32 v4, v7 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v5, v7 +; GFX6-NEXT: v_mov_b32_e32 v5, v8 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB9_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2898,25 +2903,25 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX12-NEXT: v_mov_b32_e32 v0, s16 -; GFX12-NEXT: s_add_co_i32 s5, s16, 0x800 +; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v6, s4 ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 ; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 +; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_add_f64_e32 v[6:7], v[8:9], v[4:5] -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v10, s5 -; GFX12-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX12-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add_f64_e32 v[7:8], v[9:10], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 +; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -2941,25 +2946,25 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, s16 -; GFX11-NEXT: s_add_i32 s5, s16, 0x800 +; GFX11-NEXT: s_add_i32 s4, s16, 0x800 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v6, s4 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 ; GFX11-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 +; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] -; GFX11-NEXT: v_mov_b32_e32 v10, s5 -; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], 0 offen glc +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 +; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -2974,26 +2979,26 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 ; GFX10-NEXT: v_mov_b32_e32 v5, v1 -; GFX10-NEXT: s_add_i32 s5, s20, 0x800 -; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: s_add_i32 s4, s20, 0x800 +; GFX10-NEXT: v_mov_b32_e32 v6, s4 ; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v9, v1 -; GFX10-NEXT: v_mov_b32_e32 v8, v0 -; GFX10-NEXT: v_mov_b32_e32 v10, s5 +; GFX10-NEXT: v_mov_b32_e32 v10, v1 +; GFX10-NEXT: v_mov_b32_e32 v9, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] -; GFX10-NEXT: v_mov_b32_e32 v0, v6 -; GFX10-NEXT: v_mov_b32_e32 v1, v7 -; GFX10-NEXT: v_mov_b32_e32 v2, v8 -; GFX10-NEXT: v_mov_b32_e32 v3, v9 -; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc +; GFX10-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] +; GFX10-NEXT: v_mov_b32_e32 v0, v7 +; GFX10-NEXT: v_mov_b32_e32 v1, v8 +; GFX10-NEXT: v_mov_b32_e32 v2, v9 +; GFX10-NEXT: v_mov_b32_e32 v3, v10 +; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB11_1 @@ -3010,18 +3015,18 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX90A-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x800 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, s6 ; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[0,1] -; GFX90A-NEXT: v_mov_b32_e32 v10, s6 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[8:9], v[8:9] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc +; GFX90A-NEXT: v_pk_mov_b32 v[10:11], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_add_f64 v[8:9], v[10:11], v[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[8:9] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[10:11], v[10:11] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 @@ -3038,21 +3043,21 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX908-NEXT: s_add_i32 s6, s20, 0x800 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_mov_b32_e32 v6, s6 ; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v9, v1 -; GFX908-NEXT: v_mov_b32_e32 v8, v0 -; GFX908-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v10, s6 -; GFX908-NEXT: v_mov_b32_e32 v0, v6 -; GFX908-NEXT: v_mov_b32_e32 v1, v7 -; GFX908-NEXT: v_mov_b32_e32 v2, v8 -; GFX908-NEXT: v_mov_b32_e32 v3, v9 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v10, v1 +; GFX908-NEXT: v_mov_b32_e32 v9, v0 +; GFX908-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v7 +; GFX908-NEXT: v_mov_b32_e32 v1, v8 +; GFX908-NEXT: v_mov_b32_e32 v2, v9 +; GFX908-NEXT: v_mov_b32_e32 v3, v10 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB11_1 @@ -3069,21 +3074,21 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX8-NEXT: s_add_i32 s6, s20, 0x800 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 ; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v1 -; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v10, s6 -; GFX8-NEXT: v_mov_b32_e32 v0, v6 -; GFX8-NEXT: v_mov_b32_e32 v1, v7 -; GFX8-NEXT: v_mov_b32_e32 v2, v8 -; GFX8-NEXT: v_mov_b32_e32 v3, v9 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v10, v1 +; GFX8-NEXT: v_mov_b32_e32 v9, v0 +; GFX8-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v7 +; GFX8-NEXT: v_mov_b32_e32 v1, v8 +; GFX8-NEXT: v_mov_b32_e32 v2, v9 +; GFX8-NEXT: v_mov_b32_e32 v3, v10 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB11_1 @@ -3100,21 +3105,21 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX7-NEXT: s_add_i32 s6, s20, 0x800 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s6 ; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v9, v1 -; GFX7-NEXT: v_mov_b32_e32 v8, v0 -; GFX7-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] -; GFX7-NEXT: v_mov_b32_e32 v10, s6 -; GFX7-NEXT: v_mov_b32_e32 v0, v6 -; GFX7-NEXT: v_mov_b32_e32 v1, v7 -; GFX7-NEXT: v_mov_b32_e32 v2, v8 -; GFX7-NEXT: v_mov_b32_e32 v3, v9 -; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v10, v1 +; GFX7-NEXT: v_mov_b32_e32 v9, v0 +; GFX7-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] +; GFX7-NEXT: v_mov_b32_e32 v0, v7 +; GFX7-NEXT: v_mov_b32_e32 v1, v8 +; GFX7-NEXT: v_mov_b32_e32 v2, v9 +; GFX7-NEXT: v_mov_b32_e32 v3, v10 +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB11_1 @@ -3131,22 +3136,22 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX6-NEXT: s_add_i32 s6, s20, 0x800 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: v_mov_b32_e32 v6, s6 ; GFX6-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v9, v1 -; GFX6-NEXT: v_mov_b32_e32 v8, v0 -; GFX6-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] -; GFX6-NEXT: v_mov_b32_e32 v10, s6 +; GFX6-NEXT: v_mov_b32_e32 v10, v1 +; GFX6-NEXT: v_mov_b32_e32 v9, v0 +; GFX6-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, v6 -; GFX6-NEXT: v_mov_b32_e32 v1, v7 -; GFX6-NEXT: v_mov_b32_e32 v2, v8 -; GFX6-NEXT: v_mov_b32_e32 v3, v9 -; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v0, v7 +; GFX6-NEXT: v_mov_b32_e32 v1, v8 +; GFX6-NEXT: v_mov_b32_e32 v2, v9 +; GFX6-NEXT: v_mov_b32_e32 v3, v10 +; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB11_1 @@ -3169,25 +3174,25 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX12-NEXT: v_mov_b32_e32 v0, s16 -; GFX12-NEXT: s_add_co_i32 s5, s16, 0x800 +; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v6, s4 ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 ; GFX12-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 +; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_add_f64_e32 v[6:7], v[8:9], v[4:5] -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v10, s5 -; GFX12-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX12-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add_f64_e32 v[7:8], v[9:10], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 +; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -3212,25 +3217,25 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, s16 -; GFX11-NEXT: s_add_i32 s5, s16, 0x800 +; GFX11-NEXT: s_add_i32 s4, s16, 0x800 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v6, s4 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 ; GFX11-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 +; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] -; GFX11-NEXT: v_mov_b32_e32 v10, s5 -; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], 0 offen glc +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 +; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -3245,26 +3250,26 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 ; GFX10-NEXT: v_mov_b32_e32 v5, v1 -; GFX10-NEXT: s_add_i32 s5, s20, 0x800 -; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: s_add_i32 s4, s20, 0x800 +; GFX10-NEXT: v_mov_b32_e32 v6, s4 ; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v9, v1 -; GFX10-NEXT: v_mov_b32_e32 v8, v0 -; GFX10-NEXT: v_mov_b32_e32 v10, s5 +; GFX10-NEXT: v_mov_b32_e32 v10, v1 +; GFX10-NEXT: v_mov_b32_e32 v9, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] -; GFX10-NEXT: v_mov_b32_e32 v0, v6 -; GFX10-NEXT: v_mov_b32_e32 v1, v7 -; GFX10-NEXT: v_mov_b32_e32 v2, v8 -; GFX10-NEXT: v_mov_b32_e32 v3, v9 -; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc +; GFX10-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] +; GFX10-NEXT: v_mov_b32_e32 v0, v7 +; GFX10-NEXT: v_mov_b32_e32 v1, v8 +; GFX10-NEXT: v_mov_b32_e32 v2, v9 +; GFX10-NEXT: v_mov_b32_e32 v3, v10 +; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB12_1 @@ -3290,21 +3295,21 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX908-NEXT: s_add_i32 s6, s20, 0x800 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_mov_b32_e32 v6, s6 ; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v9, v1 -; GFX908-NEXT: v_mov_b32_e32 v8, v0 -; GFX908-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v10, s6 -; GFX908-NEXT: v_mov_b32_e32 v0, v6 -; GFX908-NEXT: v_mov_b32_e32 v1, v7 -; GFX908-NEXT: v_mov_b32_e32 v2, v8 -; GFX908-NEXT: v_mov_b32_e32 v3, v9 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v10, v1 +; GFX908-NEXT: v_mov_b32_e32 v9, v0 +; GFX908-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v7 +; GFX908-NEXT: v_mov_b32_e32 v1, v8 +; GFX908-NEXT: v_mov_b32_e32 v2, v9 +; GFX908-NEXT: v_mov_b32_e32 v3, v10 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB12_1 @@ -3321,21 +3326,21 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX8-NEXT: s_add_i32 s6, s20, 0x800 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 ; GFX8-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v1 -; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v10, s6 -; GFX8-NEXT: v_mov_b32_e32 v0, v6 -; GFX8-NEXT: v_mov_b32_e32 v1, v7 -; GFX8-NEXT: v_mov_b32_e32 v2, v8 -; GFX8-NEXT: v_mov_b32_e32 v3, v9 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v10, v1 +; GFX8-NEXT: v_mov_b32_e32 v9, v0 +; GFX8-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v7 +; GFX8-NEXT: v_mov_b32_e32 v1, v8 +; GFX8-NEXT: v_mov_b32_e32 v2, v9 +; GFX8-NEXT: v_mov_b32_e32 v3, v10 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB12_1 @@ -3352,21 +3357,21 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX7-NEXT: s_add_i32 s6, s20, 0x800 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s6 ; GFX7-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v9, v1 -; GFX7-NEXT: v_mov_b32_e32 v8, v0 -; GFX7-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] -; GFX7-NEXT: v_mov_b32_e32 v10, s6 -; GFX7-NEXT: v_mov_b32_e32 v0, v6 -; GFX7-NEXT: v_mov_b32_e32 v1, v7 -; GFX7-NEXT: v_mov_b32_e32 v2, v8 -; GFX7-NEXT: v_mov_b32_e32 v3, v9 -; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v10, v1 +; GFX7-NEXT: v_mov_b32_e32 v9, v0 +; GFX7-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] +; GFX7-NEXT: v_mov_b32_e32 v0, v7 +; GFX7-NEXT: v_mov_b32_e32 v1, v8 +; GFX7-NEXT: v_mov_b32_e32 v2, v9 +; GFX7-NEXT: v_mov_b32_e32 v3, v10 +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB12_1 @@ -3383,22 +3388,22 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX6-NEXT: s_add_i32 s6, s20, 0x800 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: v_mov_b32_e32 v6, s6 ; GFX6-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v9, v1 -; GFX6-NEXT: v_mov_b32_e32 v8, v0 -; GFX6-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] -; GFX6-NEXT: v_mov_b32_e32 v10, s6 +; GFX6-NEXT: v_mov_b32_e32 v10, v1 +; GFX6-NEXT: v_mov_b32_e32 v9, v0 +; GFX6-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, v6 -; GFX6-NEXT: v_mov_b32_e32 v1, v7 -; GFX6-NEXT: v_mov_b32_e32 v2, v8 -; GFX6-NEXT: v_mov_b32_e32 v3, v9 -; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v0, v7 +; GFX6-NEXT: v_mov_b32_e32 v1, v8 +; GFX6-NEXT: v_mov_b32_e32 v2, v9 +; GFX6-NEXT: v_mov_b32_e32 v3, v10 +; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB12_1 @@ -3426,43 +3431,43 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-NEXT: s_addk_co_i32 s16, 0x200 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_b32 s4, s16, -4 -; GFX12-NEXT: s_and_b32 s5, s16, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v1, s4 -; GFX12-NEXT: s_lshl_b32 s5, s5, 3 +; GFX12-NEXT: v_mov_b32_e32 v5, s4 +; GFX12-NEXT: s_and_b32 s4, s16, 3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_lshl_b32 s4, s4, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_lshl_b32 s6, 0xffff, s5 +; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX12-NEXT: buffer_load_b32 v2, v5, s[0:3], null offen ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_not_b32 s7, s6 -; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], null offen -; GFX12-NEXT: s_mov_b32 s6, 0 +; GFX12-NEXT: s_not_b32 s6, s5 +; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v1, s5, v2 +; GFX12-NEXT: v_lshrrev_b32_e32 v1, s4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add_f16_e32 v1, v1, v0 ; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v1, s5, v1 +; GFX12-NEXT: v_lshlrev_b32_e32 v1, s4, v1 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_and_or_b32 v1, v2, s7, v1 -; GFX12-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_mov_b32 v4, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_mov_b32_e32 v3, v1 +; GFX12-NEXT: v_and_or_b32 v1, v2, s6, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 ; GFX12-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_cbranch_execnz .LBB13_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s6 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, s5, v3 +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -3470,25 +3475,24 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_addk_i32 s16, 0x200 -; GFX940-NEXT: s_and_b32 s6, s16, -4 -; GFX940-NEXT: v_mov_b32_e32 v1, s6 +; GFX940-NEXT: s_and_b32 s4, s16, -4 +; GFX940-NEXT: v_mov_b32_e32 v1, s4 ; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen ; GFX940-NEXT: s_and_b32 s4, s16, 3 -; GFX940-NEXT: s_lshl_b32 s7, s4, 3 -; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX940-NEXT: s_not_b32 s8, s4 +; GFX940-NEXT: s_lshl_b32 s6, s4, 3 +; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX940-NEXT: s_not_b32 s7, s4 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v1, s7, v3 -; GFX940-NEXT: v_add_f16_e32 v1, v1, v0 -; GFX940-NEXT: v_lshlrev_b32_e32 v1, s7, v1 -; GFX940-NEXT: v_and_or_b32 v2, v3, s8, v1 -; GFX940-NEXT: v_mov_b32_e32 v6, s6 +; GFX940-NEXT: v_lshrrev_b32_e32 v2, s6, v3 +; GFX940-NEXT: v_add_f16_e32 v2, v2, v0 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, s6, v2 +; GFX940-NEXT: v_and_or_b32 v2, v3, s7, v2 ; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[0:3], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 @@ -3498,51 +3502,49 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX940-NEXT: s_cbranch_execnz .LBB13_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, s7, v4 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_addk_i32 s16, 0x200 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s4, s16, -4 -; GFX11-NEXT: s_and_b32 s5, s16, 3 -; GFX11-NEXT: v_mov_b32_e32 v1, s4 -; GFX11-NEXT: s_lshl_b32 s5, s5, 3 -; GFX11-NEXT: s_lshl_b32 s6, 0xffff, s5 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_not_b32 s7, s6 -; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen -; GFX11-NEXT: s_mov_b32 s6, 0 -; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: v_mov_b32_e32 v5, s4 +; GFX11-NEXT: s_and_b32 s4, s16, 3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_lshl_b32 s4, s4, 3 +; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX11-NEXT: buffer_load_b32 v2, v5, s[0:3], 0 offen +; GFX11-NEXT: s_not_b32 s6, s5 +; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, s5, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, s4, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_f16_e32 v1, v1, v0 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v1, s5, v1 -; GFX11-NEXT: v_and_or_b32 v1, v2, s7, v1 -; GFX11-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_mov_b32 v4, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_mov_b32_e32 v3, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, s4, v1 +; GFX11-NEXT: v_and_or_b32 v1, v2, s6, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 ; GFX11-NEXT: v_mov_b32_e32 v2, v3 -; GFX11-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_cbranch_execnz .LBB13_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s6 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, s5, v3 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory: @@ -3550,22 +3552,21 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_addk_i32 s20, 0x200 ; GFX10-NEXT: s_and_b32 s4, s20, -4 -; GFX10-NEXT: s_and_b32 s5, s20, 3 -; GFX10-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-NEXT: s_lshl_b32 s5, s5, 3 -; GFX10-NEXT: s_lshl_b32 s6, 0xffff, s5 -; GFX10-NEXT: s_not_b32 s7, s6 -; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen -; GFX10-NEXT: s_mov_b32 s6, 0 +; GFX10-NEXT: v_mov_b32_e32 v5, s4 +; GFX10-NEXT: s_and_b32 s4, s20, 3 +; GFX10-NEXT: s_lshl_b32 s4, s4, 3 +; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX10-NEXT: buffer_load_dword v2, v5, s[16:19], 0 offen +; GFX10-NEXT: s_not_b32 s6, s5 +; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v1, s5, v2 -; GFX10-NEXT: v_mov_b32_e32 v5, s4 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, s4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_add_f16_e32 v1, v1, v0 -; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v1, v2, s7, v1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v1, v2, s6, v1 ; GFX10-NEXT: v_mov_b32_e32 v4, v2 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc @@ -3574,36 +3575,35 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, v3 -; GFX10-NEXT: s_or_b32 s6, vcc_lo, s6 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s6 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB13_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s6 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, s5, v3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_addk_i32 s20, 0x200 -; GFX90A-NEXT: s_and_b32 s6, s20, -4 -; GFX90A-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NEXT: s_and_b32 s4, s20, -4 +; GFX90A-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen ; GFX90A-NEXT: s_and_b32 s4, s20, 3 -; GFX90A-NEXT: s_lshl_b32 s7, s4, 3 -; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX90A-NEXT: s_not_b32 s8, s4 +; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 +; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX90A-NEXT: s_not_b32 s7, s4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v1, s7, v3 -; GFX90A-NEXT: v_add_f16_e32 v1, v1, v0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v1, s7, v1 -; GFX90A-NEXT: v_and_or_b32 v2, v3, s8, v1 -; GFX90A-NEXT: v_mov_b32_e32 v6, s6 +; GFX90A-NEXT: v_lshrrev_b32_e32 v2, s6, v3 +; GFX90A-NEXT: v_add_f16_e32 v2, v2, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, s6, v2 +; GFX90A-NEXT: v_and_or_b32 v2, v3, s7, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 @@ -3613,30 +3613,29 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s7, v4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_addk_i32 s20, 0x200 -; GFX908-NEXT: s_and_b32 s6, s20, -4 -; GFX908-NEXT: v_mov_b32_e32 v1, s6 -; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen +; GFX908-NEXT: s_and_b32 s4, s20, -4 +; GFX908-NEXT: v_mov_b32_e32 v5, s4 +; GFX908-NEXT: buffer_load_dword v2, v5, s[16:19], 0 offen ; GFX908-NEXT: s_and_b32 s4, s20, 3 -; GFX908-NEXT: s_lshl_b32 s7, s4, 3 -; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX908-NEXT: s_not_b32 s8, s4 +; GFX908-NEXT: s_lshl_b32 s6, s4, 3 +; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX908-NEXT: s_not_b32 s7, s4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v1, s7, v2 +; GFX908-NEXT: v_lshrrev_b32_e32 v1, s6, v2 ; GFX908-NEXT: v_add_f16_e32 v1, v1, v0 -; GFX908-NEXT: v_lshlrev_b32_e32 v1, s7, v1 -; GFX908-NEXT: v_and_or_b32 v1, v2, s8, v1 +; GFX908-NEXT: v_lshlrev_b32_e32 v1, s6, v1 +; GFX908-NEXT: v_and_or_b32 v1, v2, s7, v1 ; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_mov_b32_e32 v5, s6 ; GFX908-NEXT: v_mov_b32_e32 v3, v1 ; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -3648,31 +3647,30 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX908-NEXT: s_cbranch_execnz .LBB13_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, s7, v3 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_addk_i32 s20, 0x200 -; GFX8-NEXT: s_and_b32 s6, s20, -4 -; GFX8-NEXT: v_mov_b32_e32 v1, s6 -; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen +; GFX8-NEXT: s_and_b32 s4, s20, -4 +; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: buffer_load_dword v2, v5, s[16:19], 0 offen ; GFX8-NEXT: s_and_b32 s4, s20, 3 -; GFX8-NEXT: s_lshl_b32 s7, s4, 3 -; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX8-NEXT: s_not_b32 s8, s4 +; GFX8-NEXT: s_lshl_b32 s6, s4, 3 +; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX8-NEXT: s_not_b32 s7, s4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v1, s7, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, s6, v2 ; GFX8-NEXT: v_add_f16_e32 v1, v1, v0 -; GFX8-NEXT: v_and_b32_e32 v3, s8, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, s7, v1 +; GFX8-NEXT: v_and_b32_e32 v3, s7, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, s6, v1 ; GFX8-NEXT: v_or_b32_e32 v1, v3, v1 ; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 ; GFX8-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -3684,37 +3682,36 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX8-NEXT: s_cbranch_execnz .LBB13_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s7, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_addk_i32 s20, 0x200 -; GFX7-NEXT: s_and_b32 s6, s20, -4 -; GFX7-NEXT: v_mov_b32_e32 v1, s6 -; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX7-NEXT: s_and_b32 s4, s20, -4 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_and_b32 s4, s20, 3 -; GFX7-NEXT: s_lshl_b32 s7, s4, 3 -; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v0 -; GFX7-NEXT: s_not_b32 s8, s4 +; GFX7-NEXT: s_lshl_b32 s6, s4, 3 +; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0 +; GFX7-NEXT: s_not_b32 s7, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s7, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_and_b32_e32 v2, s8, v1 -; GFX7-NEXT: v_mov_b32_e32 v5, s6 -; GFX7-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX7-NEXT: v_and_b32_e32 v2, s7, v1 +; GFX7-NEXT: v_add_f32_e32 v0, v0, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s7, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX7-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -3724,7 +3721,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX7-NEXT: s_cbranch_execnz .LBB13_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s7, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -3732,31 +3729,30 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_addk_i32 s20, 0x200 -; GFX6-NEXT: s_and_b32 s6, s20, -4 -; GFX6-NEXT: v_mov_b32_e32 v1, s6 -; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX6-NEXT: s_and_b32 s4, s20, -4 +; GFX6-NEXT: v_mov_b32_e32 v4, s4 +; GFX6-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: s_and_b32 s4, s20, 3 -; GFX6-NEXT: s_lshl_b32 s7, s4, 3 -; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v0 -; GFX6-NEXT: s_not_b32 s8, s4 +; GFX6-NEXT: s_lshl_b32 s6, s4, 3 +; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0 +; GFX6-NEXT: s_not_b32 s7, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s7, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v2, s8, v1 -; GFX6-NEXT: v_mov_b32_e32 v5, s6 -; GFX6-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX6-NEXT: v_and_b32_e32 v2, s7, v1 +; GFX6-NEXT: v_add_f32_e32 v0, v0, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s7, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT: v_mov_b32_e32 v3, v1 ; GFX6-NEXT: v_mov_b32_e32 v2, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -3766,7 +3762,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX6-NEXT: s_cbranch_execnz .LBB13_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s7, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -3786,42 +3782,42 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_addk_co_i32 s16, 0x200 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_b32 s4, s16, -4 -; GFX12-NEXT: s_and_b32 s5, s16, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v1, s4 -; GFX12-NEXT: s_lshl_b32 s5, s5, 3 +; GFX12-NEXT: v_mov_b32_e32 v3, s4 +; GFX12-NEXT: s_and_b32 s4, s16, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_lshl_b32 s6, 0xffff, s5 +; GFX12-NEXT: s_lshl_b32 s4, s4, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_not_b32 s7, s6 -; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], null offen -; GFX12-NEXT: s_mov_b32 s6, 0 +; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX12-NEXT: buffer_load_b32 v2, v3, s[0:3], null offen +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_not_b32 s6, s5 +; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v1, s5, v2 +; GFX12-NEXT: v_lshrrev_b32_e32 v1, s4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add_f16_e32 v1, v1, v0 ; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v1, s5, v1 +; GFX12-NEXT: v_lshlrev_b32_e32 v1, s4, v1 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_and_or_b32 v1, v2, s7, v1 -; GFX12-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_mov_b32 v4, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_mov_b32_e32 v3, v1 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: v_and_or_b32 v1, v2, s6, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX12-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX12-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_cbranch_execnz .LBB14_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -3829,25 +3825,24 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_addk_i32 s16, 0x200 -; GFX940-NEXT: s_and_b32 s6, s16, -4 -; GFX940-NEXT: v_mov_b32_e32 v1, s6 +; GFX940-NEXT: s_and_b32 s4, s16, -4 +; GFX940-NEXT: v_mov_b32_e32 v1, s4 ; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen ; GFX940-NEXT: s_and_b32 s4, s16, 3 -; GFX940-NEXT: s_lshl_b32 s7, s4, 3 -; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX940-NEXT: s_not_b32 s8, s4 +; GFX940-NEXT: s_lshl_b32 s6, s4, 3 +; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX940-NEXT: s_not_b32 s7, s4 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v1, s7, v3 -; GFX940-NEXT: v_add_f16_e32 v1, v1, v0 -; GFX940-NEXT: v_lshlrev_b32_e32 v1, s7, v1 -; GFX940-NEXT: v_and_or_b32 v2, v3, s8, v1 -; GFX940-NEXT: v_mov_b32_e32 v6, s6 +; GFX940-NEXT: v_lshrrev_b32_e32 v2, s6, v3 +; GFX940-NEXT: v_add_f16_e32 v2, v2, v0 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, s6, v2 +; GFX940-NEXT: v_and_or_b32 v2, v3, s7, v2 ; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[0:3], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 @@ -3863,43 +3858,41 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_addk_i32 s16, 0x200 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s4, s16, -4 -; GFX11-NEXT: s_and_b32 s5, s16, 3 -; GFX11-NEXT: v_mov_b32_e32 v1, s4 -; GFX11-NEXT: s_lshl_b32 s5, s5, 3 -; GFX11-NEXT: s_lshl_b32 s6, 0xffff, s5 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_not_b32 s7, s6 -; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen -; GFX11-NEXT: s_mov_b32 s6, 0 -; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: v_mov_b32_e32 v3, s4 +; GFX11-NEXT: s_and_b32 s4, s16, 3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_lshl_b32 s4, s4, 3 +; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX11-NEXT: buffer_load_b32 v2, v3, s[0:3], 0 offen +; GFX11-NEXT: s_not_b32 s6, s5 +; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, s5, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, s4, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_f16_e32 v1, v1, v0 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v1, s5, v1 -; GFX11-NEXT: v_and_or_b32 v1, v2, s7, v1 -; GFX11-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_mov_b32 v4, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_mov_b32_e32 v3, v1 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc +; GFX11-NEXT: v_lshlrev_b32_e32 v1, s4, v1 +; GFX11-NEXT: v_and_or_b32 v1, v2, s6, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, v3 -; GFX11-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX11-NEXT: v_mov_b32_e32 v2, v4 +; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_cbranch_execnz .LBB14_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory: @@ -3907,59 +3900,57 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_addk_i32 s20, 0x200 ; GFX10-NEXT: s_and_b32 s4, s20, -4 -; GFX10-NEXT: s_and_b32 s5, s20, 3 -; GFX10-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-NEXT: s_lshl_b32 s5, s5, 3 -; GFX10-NEXT: s_lshl_b32 s6, 0xffff, s5 -; GFX10-NEXT: s_not_b32 s7, s6 -; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen -; GFX10-NEXT: s_mov_b32 s6, 0 +; GFX10-NEXT: v_mov_b32_e32 v3, s4 +; GFX10-NEXT: s_and_b32 s4, s20, 3 +; GFX10-NEXT: s_lshl_b32 s4, s4, 3 +; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX10-NEXT: buffer_load_dword v2, v3, s[16:19], 0 offen +; GFX10-NEXT: s_not_b32 s6, s5 +; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v1, s5, v2 -; GFX10-NEXT: v_mov_b32_e32 v5, s4 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, s4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_add_f16_e32 v1, v1, v0 -; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v1, v2, s7, v1 -; GFX10-NEXT: v_mov_b32_e32 v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v1, v2, s6, v1 +; GFX10-NEXT: v_mov_b32_e32 v5, v2 +; GFX10-NEXT: v_mov_b32_e32 v4, v1 +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v3 -; GFX10-NEXT: s_or_b32 s6, vcc_lo, s6 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v4 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB14_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_addk_i32 s20, 0x200 -; GFX90A-NEXT: s_and_b32 s6, s20, -4 -; GFX90A-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NEXT: s_and_b32 s4, s20, -4 +; GFX90A-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen ; GFX90A-NEXT: s_and_b32 s4, s20, 3 -; GFX90A-NEXT: s_lshl_b32 s7, s4, 3 -; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX90A-NEXT: s_not_b32 s8, s4 +; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 +; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX90A-NEXT: s_not_b32 s7, s4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v1, s7, v3 -; GFX90A-NEXT: v_add_f16_e32 v1, v1, v0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v1, s7, v1 -; GFX90A-NEXT: v_and_or_b32 v2, v3, s8, v1 -; GFX90A-NEXT: v_mov_b32_e32 v6, s6 +; GFX90A-NEXT: v_lshrrev_b32_e32 v2, s6, v3 +; GFX90A-NEXT: v_add_f16_e32 v2, v2, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, s6, v2 +; GFX90A-NEXT: v_and_or_b32 v2, v3, s7, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 @@ -3975,30 +3966,29 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_addk_i32 s20, 0x200 -; GFX908-NEXT: s_and_b32 s6, s20, -4 -; GFX908-NEXT: v_mov_b32_e32 v1, s6 -; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen +; GFX908-NEXT: s_and_b32 s4, s20, -4 +; GFX908-NEXT: v_mov_b32_e32 v3, s4 +; GFX908-NEXT: buffer_load_dword v2, v3, s[16:19], 0 offen ; GFX908-NEXT: s_and_b32 s4, s20, 3 -; GFX908-NEXT: s_lshl_b32 s7, s4, 3 -; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX908-NEXT: s_not_b32 s8, s4 +; GFX908-NEXT: s_lshl_b32 s6, s4, 3 +; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX908-NEXT: s_not_b32 s7, s4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v1, s7, v2 +; GFX908-NEXT: v_lshrrev_b32_e32 v1, s6, v2 ; GFX908-NEXT: v_add_f16_e32 v1, v1, v0 -; GFX908-NEXT: v_lshlrev_b32_e32 v1, s7, v1 -; GFX908-NEXT: v_and_or_b32 v1, v2, s8, v1 -; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_mov_b32_e32 v5, s6 -; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX908-NEXT: v_lshlrev_b32_e32 v1, s6, v1 +; GFX908-NEXT: v_and_or_b32 v1, v2, s7, v1 +; GFX908-NEXT: v_mov_b32_e32 v5, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v1 +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v2, v3 +; GFX908-NEXT: v_mov_b32_e32 v2, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB14_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4009,31 +3999,30 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_addk_i32 s20, 0x200 -; GFX8-NEXT: s_and_b32 s6, s20, -4 -; GFX8-NEXT: v_mov_b32_e32 v1, s6 -; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen +; GFX8-NEXT: s_and_b32 s4, s20, -4 +; GFX8-NEXT: v_mov_b32_e32 v3, s4 +; GFX8-NEXT: buffer_load_dword v2, v3, s[16:19], 0 offen ; GFX8-NEXT: s_and_b32 s4, s20, 3 -; GFX8-NEXT: s_lshl_b32 s7, s4, 3 -; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX8-NEXT: s_not_b32 s8, s4 +; GFX8-NEXT: s_lshl_b32 s6, s4, 3 +; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX8-NEXT: s_not_b32 s7, s4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v1, s7, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, s6, v2 ; GFX8-NEXT: v_add_f16_e32 v1, v1, v0 -; GFX8-NEXT: v_and_b32_e32 v3, s8, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, s7, v1 -; GFX8-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX8-NEXT: v_and_b32_e32 v4, s7, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, s6, v1 +; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 +; GFX8-NEXT: v_mov_b32_e32 v5, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v1 +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v2, v3 +; GFX8-NEXT: v_mov_b32_e32 v2, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB14_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4044,35 +4033,34 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_addk_i32 s20, 0x200 -; GFX7-NEXT: s_and_b32 s6, s20, -4 -; GFX7-NEXT: v_mov_b32_e32 v1, s6 -; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX7-NEXT: s_and_b32 s4, s20, -4 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_and_b32 s4, s20, 3 -; GFX7-NEXT: s_lshl_b32 s7, s4, 3 -; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX7-NEXT: s_not_b32 s8, s4 +; GFX7-NEXT: s_lshl_b32 s6, s4, 3 +; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX7-NEXT: s_not_b32 s7, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s7, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_and_b32_e32 v3, s8, v1 -; GFX7-NEXT: v_mov_b32_e32 v5, s6 -; GFX7-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX7-NEXT: v_and_b32_e32 v4, s7, v1 +; GFX7-NEXT: v_add_f32_e32 v0, v0, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s7, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX7-NEXT: v_mov_b32_e32 v4, v1 -; GFX7-NEXT: v_mov_b32_e32 v3, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX7-NEXT: v_mov_b32_e32 v5, v1 +; GFX7-NEXT: v_mov_b32_e32 v4, v0 +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v1, v3 +; GFX7-NEXT: v_mov_b32_e32 v1, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB14_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4083,36 +4071,35 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_addk_i32 s20, 0x200 -; GFX6-NEXT: s_and_b32 s6, s20, -4 -; GFX6-NEXT: v_mov_b32_e32 v1, s6 -; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX6-NEXT: s_and_b32 s4, s20, -4 +; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: s_and_b32 s4, s20, 3 -; GFX6-NEXT: s_lshl_b32 s7, s4, 3 -; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX6-NEXT: s_not_b32 s8, s4 +; GFX6-NEXT: s_lshl_b32 s6, s4, 3 +; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX6-NEXT: s_not_b32 s7, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s7, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v3, s8, v1 -; GFX6-NEXT: v_mov_b32_e32 v5, s6 -; GFX6-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX6-NEXT: v_and_b32_e32 v4, s7, v1 +; GFX6-NEXT: v_add_f32_e32 v0, v0, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s7, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX6-NEXT: v_mov_b32_e32 v4, v1 -; GFX6-NEXT: v_mov_b32_e32 v3, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX6-NEXT: v_mov_b32_e32 v5, v1 +; GFX6-NEXT: v_mov_b32_e32 v4, v0 +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v1, v3 +; GFX6-NEXT: v_mov_b32_e32 v1, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB14_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4781,27 +4768,28 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_addk_co_i32 s16, 0x200 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_b32 s4, s16, -4 -; GFX12-NEXT: s_and_b32 s5, s16, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v1, s4 -; GFX12-NEXT: s_lshl_b32 s5, s5, 3 +; GFX12-NEXT: v_mov_b32_e32 v4, s4 +; GFX12-NEXT: s_and_b32 s4, s16, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_lshl_b32 s6, 0xffff, s5 +; GFX12-NEXT: s_lshl_b32 s4, s4, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_not_b32 s7, s6 -; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen -; GFX12-NEXT: s_mov_b32 s6, 0 +; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_not_b32 s6, s5 +; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, s5, v1 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_lshlrev_b32 v0, 16, v0 -; GFX12-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX12-NEXT: v_add_f32_e32 v0, v0, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v3, 0x400000, v0 @@ -4811,23 +4799,23 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v0, s5, v0 +; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v5, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, v2 -; GFX12-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_cbranch_execnz .LBB16_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s6 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, s5, v2 +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -4835,33 +4823,32 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_addk_i32 s16, 0x200 -; GFX940-NEXT: s_and_b32 s6, s16, -4 -; GFX940-NEXT: v_mov_b32_e32 v1, s6 -; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen +; GFX940-NEXT: s_and_b32 s4, s16, -4 +; GFX940-NEXT: v_mov_b32_e32 v4, s4 +; GFX940-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen ; GFX940-NEXT: s_and_b32 s4, s16, 3 -; GFX940-NEXT: s_lshl_b32 s7, s4, 3 -; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX940-NEXT: s_not_b32 s8, s4 +; GFX940-NEXT: s_lshl_b32 s6, s4, 3 +; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX940-NEXT: s_not_b32 s7, s4 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v0 -; GFX940-NEXT: s_movk_i32 s9, 0x7fff +; GFX940-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX940-NEXT: s_movk_i32 s8, 0x7fff ; GFX940-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: v_mov_b32_e32 v5, s6 -; GFX940-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: v_add_f32_e32 v0, v0, v5 ; GFX940-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX940-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX940-NEXT: v_add3_u32 v2, v2, v0, s9 +; GFX940-NEXT: v_add3_u32 v2, v2, v0, s8 ; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: s_nop 1 ; GFX940-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v0, v1, s8, v0 +; GFX940-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX940-NEXT: v_mov_b64_e32 v[2:3], v[0:1] -; GFX940-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[0:3], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -4871,32 +4858,33 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX940-NEXT: s_cbranch_execnz .LBB16_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, s7, v2 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_addk_i32 s16, 0x200 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX11-NEXT: s_and_b32 s4, s16, -4 -; GFX11-NEXT: s_and_b32 s5, s16, 3 -; GFX11-NEXT: v_mov_b32_e32 v1, s4 -; GFX11-NEXT: s_lshl_b32 s5, s5, 3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_lshl_b32 s6, 0xffff, s5 -; GFX11-NEXT: s_not_b32 s7, s6 -; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen -; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v4, s4 +; GFX11-NEXT: s_and_b32 s4, s16, 3 +; GFX11-NEXT: s_lshl_b32 s4, s4, 3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX11-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen +; GFX11-NEXT: s_not_b32 s6, s5 +; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, s5, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_lshlrev_b32 v0, 16, v0 -; GFX11-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_add_f32_e32 v0, v0, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v0 @@ -4906,97 +4894,95 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, s5, v0 -; GFX11-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v5, s[0:3], 0 offen glc +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX11-NEXT: v_mov_b32_e32 v1, v2 -; GFX11-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_cbranch_execnz .LBB16_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s6 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, s5, v2 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_addk_i32 s20, 0x200 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX10-NEXT: s_and_b32 s4, s20, -4 -; GFX10-NEXT: s_and_b32 s5, s20, 3 -; GFX10-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-NEXT: s_lshl_b32 s5, s5, 3 -; GFX10-NEXT: s_lshl_b32 s6, 0xffff, s5 -; GFX10-NEXT: s_not_b32 s7, s6 -; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen -; GFX10-NEXT: s_mov_b32 s6, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, s4 +; GFX10-NEXT: s_and_b32 s4, s20, 3 +; GFX10-NEXT: s_lshl_b32 s4, s4, 3 +; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX10-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX10-NEXT: s_not_b32 s6, s5 +; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_mov_b32_e32 v5, s4 +; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX10-NEXT: v_add_f32_e32 v0, v0, v5 ; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v2 -; GFX10-NEXT: s_or_b32 s6, vcc_lo, s6 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s6 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB16_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s6 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, s5, v2 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_addk_i32 s20, 0x200 -; GFX90A-NEXT: s_and_b32 s6, s20, -4 -; GFX90A-NEXT: v_mov_b32_e32 v1, s6 -; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX90A-NEXT: s_and_b32 s4, s20, -4 +; GFX90A-NEXT: v_mov_b32_e32 v4, s4 +; GFX90A-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen ; GFX90A-NEXT: s_and_b32 s4, s20, 3 -; GFX90A-NEXT: s_lshl_b32 s7, s4, 3 -; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX90A-NEXT: s_not_b32 s8, s4 +; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 +; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX90A-NEXT: s_not_b32 s7, s4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v0 -; GFX90A-NEXT: s_movk_i32 s9, 0x7fff +; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_add_f32_e32 v0, v0, v5 ; GFX90A-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX90A-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX90A-NEXT: v_add3_u32 v2, v2, v0, s9 +; GFX90A-NEXT: v_add3_u32 v2, v2, v0, s8 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX90A-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v0, v1, s8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, s6 +; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -5006,39 +4992,38 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX90A-NEXT: s_cbranch_execnz .LBB16_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s7, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_addk_i32 s20, 0x200 -; GFX908-NEXT: s_and_b32 s6, s20, -4 -; GFX908-NEXT: v_mov_b32_e32 v1, s6 -; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX908-NEXT: s_and_b32 s4, s20, -4 +; GFX908-NEXT: v_mov_b32_e32 v4, s4 +; GFX908-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen ; GFX908-NEXT: s_and_b32 s4, s20, 3 -; GFX908-NEXT: s_lshl_b32 s7, s4, 3 -; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX908-NEXT: s_not_b32 s8, s4 +; GFX908-NEXT: s_lshl_b32 s6, s4, 3 +; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX908-NEXT: s_not_b32 s7, s4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v0 -; GFX908-NEXT: s_movk_i32 s9, 0x7fff +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX908-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_add_f32_e32 v0, v0, v5 ; GFX908-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX908-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX908-NEXT: v_add3_u32 v2, v2, v0, s9 +; GFX908-NEXT: v_add3_u32 v2, v2, v0, s8 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX908-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v0, v1, s8, v0 +; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: v_mov_b32_e32 v5, s6 ; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -5048,41 +5033,40 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX908-NEXT: s_cbranch_execnz .LBB16_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, s7, v2 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_addk_i32 s20, 0x200 -; GFX8-NEXT: s_and_b32 s6, s20, -4 -; GFX8-NEXT: v_mov_b32_e32 v1, s6 -; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX8-NEXT: s_and_b32 s4, s20, -4 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen ; GFX8-NEXT: s_and_b32 s4, s20, 3 -; GFX8-NEXT: s_lshl_b32 s7, s4, 3 -; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX8-NEXT: s_not_b32 s8, s4 +; GFX8-NEXT: s_lshl_b32 s6, s4, 3 +; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX8-NEXT: s_not_b32 s7, s4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_mov_b32_e32 v0, s7 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX8-NEXT: v_add_f32_e32 v3, v3, v5 ; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 ; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc -; GFX8-NEXT: v_and_b32_e32 v2, s8, v1 +; GFX8-NEXT: v_and_b32_e32 v2, s7, v1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -5092,37 +5076,36 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX8-NEXT: s_cbranch_execnz .LBB16_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s7, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_addk_i32 s20, 0x200 -; GFX7-NEXT: s_and_b32 s6, s20, -4 -; GFX7-NEXT: v_mov_b32_e32 v1, s6 -; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX7-NEXT: s_and_b32 s4, s20, -4 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen ; GFX7-NEXT: s_and_b32 s4, s20, 3 -; GFX7-NEXT: s_lshl_b32 s7, s4, 3 -; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX7-NEXT: s_lshl_b32 s6, s4, 3 +; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: s_not_b32 s8, s4 +; GFX7-NEXT: s_not_b32 s7, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 ; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s7, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX7-NEXT: v_add_f32_e32 v0, v0, v5 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_and_b32_e32 v2, s8, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s7, v0 +; GFX7-NEXT: v_and_b32_e32 v2, s7, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: v_mov_b32_e32 v5, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -5132,7 +5115,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX7-NEXT: s_cbranch_execnz .LBB16_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s7, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -5140,31 +5123,30 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_addk_i32 s20, 0x200 -; GFX6-NEXT: s_and_b32 s6, s20, -4 -; GFX6-NEXT: v_mov_b32_e32 v1, s6 -; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX6-NEXT: s_and_b32 s4, s20, -4 +; GFX6-NEXT: v_mov_b32_e32 v4, s4 +; GFX6-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen ; GFX6-NEXT: s_and_b32 s4, s20, 3 -; GFX6-NEXT: s_lshl_b32 s7, s4, 3 -; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX6-NEXT: s_lshl_b32 s6, s4, 3 +; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: s_not_b32 s8, s4 +; GFX6-NEXT: s_not_b32 s7, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 ; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s7, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX6-NEXT: v_add_f32_e32 v0, v0, v5 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v2, s8, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s7, v0 +; GFX6-NEXT: v_and_b32_e32 v2, s7, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT: v_mov_b32_e32 v3, v1 -; GFX6-NEXT: v_mov_b32_e32 v5, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -5174,7 +5156,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX6-NEXT: s_cbranch_execnz .LBB16_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s7, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -5192,52 +5174,53 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_addk_co_i32 s16, 0x200 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_b32 s4, s16, -4 -; GFX12-NEXT: s_and_b32 s5, s16, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v1, s4 -; GFX12-NEXT: s_lshl_b32 s5, s5, 3 +; GFX12-NEXT: v_mov_b32_e32 v2, s4 +; GFX12-NEXT: s_and_b32 s4, s16, 3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_lshl_b32 s4, s4, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_lshl_b32 s6, 0xffff, s5 +; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_not_b32 s7, s6 -; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen -; GFX12-NEXT: s_mov_b32 s6, 0 +; GFX12-NEXT: s_not_b32 s6, s5 +; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, s5, v1 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_lshlrev_b32 v0, 16, v0 -; GFX12-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX12-NEXT: v_add_f32_e32 v0, v0, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX12-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX12-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX12-NEXT: v_add3_u32 v4, v4, v0, 0x7fff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v0, s5, v0 +; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v3, v0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v1 -; GFX12-NEXT: v_mov_b32_e32 v1, v3 -; GFX12-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 +; GFX12-NEXT: v_mov_b32_e32 v1, v4 +; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_cbranch_execnz .LBB17_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -5245,33 +5228,32 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_addk_i32 s16, 0x200 -; GFX940-NEXT: s_and_b32 s6, s16, -4 -; GFX940-NEXT: v_mov_b32_e32 v1, s6 -; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen +; GFX940-NEXT: s_and_b32 s4, s16, -4 +; GFX940-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen ; GFX940-NEXT: s_and_b32 s4, s16, 3 -; GFX940-NEXT: s_lshl_b32 s7, s4, 3 -; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX940-NEXT: s_not_b32 s8, s4 +; GFX940-NEXT: s_lshl_b32 s6, s4, 3 +; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX940-NEXT: s_not_b32 s7, s4 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX940-NEXT: s_movk_i32 s9, 0x7fff +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX940-NEXT: s_movk_i32 s8, 0x7fff ; GFX940-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: v_mov_b32_e32 v3, s6 -; GFX940-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: v_add_f32_e32 v0, v0, v3 ; GFX940-NEXT: v_bfe_u32 v4, v0, 16, 1 ; GFX940-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX940-NEXT: v_add3_u32 v4, v4, v0, s9 +; GFX940-NEXT: v_add3_u32 v4, v4, v0, s8 ; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: s_nop 1 ; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v0, v1, s8, v0 +; GFX940-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[0:1] -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[0:3], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 @@ -5287,123 +5269,122 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_addk_i32 s16, 0x200 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX11-NEXT: s_and_b32 s4, s16, -4 -; GFX11-NEXT: s_and_b32 s5, s16, 3 -; GFX11-NEXT: v_mov_b32_e32 v1, s4 -; GFX11-NEXT: s_lshl_b32 s5, s5, 3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_lshl_b32 s6, 0xffff, s5 -; GFX11-NEXT: s_not_b32 s7, s6 -; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen -; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-NEXT: s_and_b32 s4, s16, 3 +; GFX11-NEXT: s_lshl_b32 s4, s4, 3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX11-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen +; GFX11-NEXT: s_not_b32 s6, s5 +; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, s5, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_lshlrev_b32 v0, 16, v0 -; GFX11-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_add_f32_e32 v0, v0, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX11-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX11-NEXT: v_add3_u32 v4, v4, v0, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, s5, v0 -; GFX11-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v3, v0 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc +; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v1 -; GFX11-NEXT: v_mov_b32_e32 v1, v3 -; GFX11-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, v4 +; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_cbranch_execnz .LBB17_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_addk_i32 s20, 0x200 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX10-NEXT: s_and_b32 s4, s20, -4 -; GFX10-NEXT: s_and_b32 s5, s20, 3 -; GFX10-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-NEXT: s_lshl_b32 s5, s5, 3 -; GFX10-NEXT: s_lshl_b32 s6, 0xffff, s5 -; GFX10-NEXT: s_not_b32 s7, s6 -; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen -; GFX10-NEXT: s_mov_b32 s6, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-NEXT: s_and_b32 s4, s20, 3 +; GFX10-NEXT: s_lshl_b32 s4, s4, 3 +; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX10-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX10-NEXT: s_not_b32 s6, s5 +; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_mov_b32_e32 v5, s4 +; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX10-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX10-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v0, v1, s7, v0 -; GFX10-NEXT: v_mov_b32_e32 v4, v1 -; GFX10-NEXT: v_mov_b32_e32 v3, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX10-NEXT: v_add3_u32 v4, v4, v0, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX10-NEXT: v_mov_b32_e32 v5, v1 +; GFX10-NEXT: v_mov_b32_e32 v4, v0 +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v3 -; GFX10-NEXT: s_or_b32 s6, vcc_lo, s6 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB17_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_addk_i32 s20, 0x200 -; GFX90A-NEXT: s_and_b32 s6, s20, -4 -; GFX90A-NEXT: v_mov_b32_e32 v1, s6 -; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX90A-NEXT: s_and_b32 s4, s20, -4 +; GFX90A-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX90A-NEXT: s_and_b32 s4, s20, 3 -; GFX90A-NEXT: s_lshl_b32 s7, s4, 3 -; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX90A-NEXT: s_not_b32 s8, s4 +; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 +; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX90A-NEXT: s_not_b32 s7, s4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX90A-NEXT: s_movk_i32 s9, 0x7fff +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_add_f32_e32 v0, v0, v3 ; GFX90A-NEXT: v_bfe_u32 v4, v0, 16, 1 ; GFX90A-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX90A-NEXT: v_add3_u32 v4, v4, v0, s9 +; GFX90A-NEXT: v_add3_u32 v4, v4, v0, s8 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX90A-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v0, v1, s8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 +; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 @@ -5419,37 +5400,36 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_addk_i32 s20, 0x200 -; GFX908-NEXT: s_and_b32 s6, s20, -4 -; GFX908-NEXT: v_mov_b32_e32 v1, s6 -; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX908-NEXT: s_and_b32 s4, s20, -4 +; GFX908-NEXT: v_mov_b32_e32 v2, s4 +; GFX908-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX908-NEXT: s_and_b32 s4, s20, 3 -; GFX908-NEXT: s_lshl_b32 s7, s4, 3 -; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX908-NEXT: s_not_b32 s8, s4 +; GFX908-NEXT: s_lshl_b32 s6, s4, 3 +; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX908-NEXT: s_not_b32 s7, s4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX908-NEXT: s_movk_i32 s9, 0x7fff +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX908-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX908-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v0 -; GFX908-NEXT: v_add3_u32 v3, v3, v0, s9 +; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX908-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX908-NEXT: v_add3_u32 v4, v4, v0, s8 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX908-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v0, v1, s8, v0 -; GFX908-NEXT: v_mov_b32_e32 v4, v1 -; GFX908-NEXT: v_mov_b32_e32 v5, s6 -; GFX908-NEXT: v_mov_b32_e32 v3, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX908-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX908-NEXT: v_mov_b32_e32 v5, v1 +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v1, v3 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB17_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5460,39 +5440,38 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_addk_i32 s20, 0x200 -; GFX8-NEXT: s_and_b32 s6, s20, -4 -; GFX8-NEXT: v_mov_b32_e32 v1, s6 -; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX8-NEXT: s_and_b32 s4, s20, -4 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX8-NEXT: s_and_b32 s4, s20, 3 -; GFX8-NEXT: s_lshl_b32 s7, s4, 3 -; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX8-NEXT: s_not_b32 s8, s4 +; GFX8-NEXT: s_lshl_b32 s6, s4, 3 +; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX8-NEXT: s_not_b32 s7, s4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_mov_b32_e32 v0, s7 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_add_f32_e32 v4, v4, v2 -; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 +; GFX8-NEXT: v_lshrrev_b32_sdwa v5, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc -; GFX8-NEXT: v_and_b32_e32 v3, s8, v1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX8-NEXT: v_mov_b32_e32 v4, v1 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 -; GFX8-NEXT: v_mov_b32_e32 v3, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc +; GFX8-NEXT: v_and_b32_e32 v4, s7, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX8-NEXT: v_mov_b32_e32 v5, v1 +; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v3 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB17_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5503,35 +5482,34 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_addk_i32 s20, 0x200 -; GFX7-NEXT: s_and_b32 s6, s20, -4 -; GFX7-NEXT: v_mov_b32_e32 v1, s6 -; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX7-NEXT: s_and_b32 s4, s20, -4 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX7-NEXT: s_and_b32 s4, s20, 3 -; GFX7-NEXT: s_lshl_b32 s7, s4, 3 -; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX7-NEXT: s_lshl_b32 s6, s4, 3 +; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: s_not_b32 s8, s4 +; GFX7-NEXT: s_not_b32 s7, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s7, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX7-NEXT: v_add_f32_e32 v0, v0, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_and_b32_e32 v3, s8, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s7, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX7-NEXT: v_mov_b32_e32 v4, v1 -; GFX7-NEXT: v_mov_b32_e32 v5, s6 -; GFX7-NEXT: v_mov_b32_e32 v3, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX7-NEXT: v_and_b32_e32 v4, s7, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX7-NEXT: v_mov_b32_e32 v5, v1 +; GFX7-NEXT: v_mov_b32_e32 v4, v0 +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v1, v3 +; GFX7-NEXT: v_mov_b32_e32 v1, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB17_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5542,36 +5520,35 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_addk_i32 s20, 0x200 -; GFX6-NEXT: s_and_b32 s6, s20, -4 -; GFX6-NEXT: v_mov_b32_e32 v1, s6 -; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX6-NEXT: s_and_b32 s4, s20, -4 +; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX6-NEXT: s_and_b32 s4, s20, 3 -; GFX6-NEXT: s_lshl_b32 s7, s4, 3 -; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX6-NEXT: s_lshl_b32 s6, s4, 3 +; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: s_not_b32 s8, s4 +; GFX6-NEXT: s_not_b32 s7, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX6-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s7, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX6-NEXT: v_add_f32_e32 v0, v0, v3 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v3, s8, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s7, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX6-NEXT: v_mov_b32_e32 v4, v1 -; GFX6-NEXT: v_mov_b32_e32 v5, s6 -; GFX6-NEXT: v_mov_b32_e32 v3, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX6-NEXT: v_and_b32_e32 v4, s7, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX6-NEXT: v_mov_b32_e32 v5, v1 +; GFX6-NEXT: v_mov_b32_e32 v4, v0 +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v1, v3 +; GFX6-NEXT: v_mov_b32_e32 v1, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB17_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6316,24 +6293,25 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, v0 +; GFX11-NEXT: s_add_i32 s4, s16, 0x400 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s4 ; GFX11-NEXT: v_mov_b32_e32 v0, s16 -; GFX11-NEXT: s_add_i32 s5, s16, 0x400 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 -; GFX11-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], 0 offen glc +; GFX11-NEXT: v_pk_add_f16 v4, v5, v2 +; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -6347,23 +6325,23 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 -; GFX10-NEXT: s_add_i32 s5, s20, 0x400 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 +; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: v_mov_b32_e32 v5, s5 +; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 -; GFX10-NEXT: v_mov_b32_e32 v1, v4 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX10-NEXT: v_pk_add_f16 v4, v5, v2 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB19_1 @@ -6388,18 +6366,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 -; GFX908-NEXT: v_mov_b32_e32 v0, v3 -; GFX908-NEXT: v_mov_b32_e32 v5, s6 -; GFX908-NEXT: v_mov_b32_e32 v1, v4 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v5, v0 +; GFX908-NEXT: v_pk_add_f16 v4, v5, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB19_1 @@ -6415,20 +6393,20 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: v_add_f16_sdwa v0, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_f16_e32 v1, v4, v2 -; GFX8-NEXT: v_or_b32_e32 v3, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v3 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, v4 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v5, v0 +; GFX8-NEXT: v_add_f16_sdwa v0, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_e32 v1, v5, v2 +; GFX8-NEXT: v_or_b32_e32 v4, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB19_1 @@ -6451,30 +6429,30 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_mov_b32_e32 v8, s6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_add_f32_e32 v4, v4, v2 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v5 -; GFX7-NEXT: v_or_b32_e32 v5, v0, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX7-NEXT: v_or_b32_e32 v4, v6, v0 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v0 +; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[16:19], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v7 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB19_1 @@ -6497,31 +6475,31 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, s6 ; GFX6-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_mov_b32_e32 v8, s6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_add_f32_e32 v4, v4, v2 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v5 -; GFX6-NEXT: v_or_b32_e32 v5, v0, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX6-NEXT: v_or_b32_e32 v4, v6, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v5, v7, v0 +; GFX6-NEXT: v_mov_b32_e32 v8, v6 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[16:19], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v7 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB19_1 @@ -6563,23 +6541,25 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v1, s16 -; GFX11-NEXT: s_add_i32 s5, s16, 0x400 +; GFX11-NEXT: s_add_i32 s4, s16, 0x400 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v3, s4 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_add_f16 v1, v2, v0 -; GFX11-NEXT: v_dual_mov_b32 v5, s5 :: v_dual_mov_b32 v4, v2 +; GFX11-NEXT: v_mov_b32_e32 v5, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_mov_b32_e32 v3, v1 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc +; GFX11-NEXT: v_mov_b32_e32 v4, v1 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX11-NEXT: v_mov_b32_e32 v2, v4 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -6592,23 +6572,23 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s20 -; GFX10-NEXT: s_add_i32 s5, s20, 0x400 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 +; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_add_f16 v1, v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v5, s5 +; GFX10-NEXT: v_mov_b32_e32 v5, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX10-NEXT: v_mov_b32_e32 v4, v1 +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB20_1 @@ -6641,21 +6621,21 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin ; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_add_f16_sdwa v1, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_f16_e32 v3, v2, v0 -; GFX8-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX8-NEXT: v_add_f16_e32 v4, v2, v0 +; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 +; GFX8-NEXT: v_mov_b32_e32 v5, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v1 +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v2, v3 +; GFX8-NEXT: v_mov_b32_e32 v2, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB20_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6666,41 +6646,41 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0 ; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX7-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_mov_b32_e32 v7, s6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_add_f32_e32 v4, v4, v0 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_or_b32_e32 v4, v2, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v3, v5, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v0 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_or_b32_e32 v5, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX7-NEXT: v_or_b32_e32 v4, v6, v3 +; GFX7-NEXT: v_mov_b32_e32 v7, v5 ; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v7, s[16:19], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB20_1 @@ -6712,42 +6692,42 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 ; GFX6-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_mov_b32_e32 v7, s6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_add_f32_e32 v4, v4, v0 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_or_b32_e32 v4, v2, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX6-NEXT: v_or_b32_e32 v3, v5, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v0 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_or_b32_e32 v5, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX6-NEXT: v_or_b32_e32 v4, v6, v3 +; GFX6-NEXT: v_mov_b32_e32 v7, v5 ; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v7, s[16:19], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB20_1 @@ -7288,24 +7268,25 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, v0 +; GFX11-NEXT: s_add_i32 s4, s16, 0x400 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s4 ; GFX11-NEXT: v_mov_b32_e32 v0, s16 -; GFX11-NEXT: s_add_i32 s5, s16, 0x400 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 -; GFX11-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], 0 offen glc +; GFX11-NEXT: v_pk_add_f16 v4, v5, v2 +; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -7319,23 +7300,23 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 -; GFX10-NEXT: s_add_i32 s5, s20, 0x400 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 +; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: v_mov_b32_e32 v5, s5 +; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 -; GFX10-NEXT: v_mov_b32_e32 v1, v4 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX10-NEXT: v_pk_add_f16 v4, v5, v2 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB22_1 @@ -7351,12 +7332,12 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -7377,18 +7358,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 -; GFX908-NEXT: v_mov_b32_e32 v0, v3 -; GFX908-NEXT: v_mov_b32_e32 v5, s6 -; GFX908-NEXT: v_mov_b32_e32 v1, v4 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v5, v0 +; GFX908-NEXT: v_pk_add_f16 v4, v5, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB22_1 @@ -7404,20 +7385,20 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: v_add_f16_sdwa v0, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_f16_e32 v1, v4, v2 -; GFX8-NEXT: v_or_b32_e32 v3, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v3 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, v4 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v5, v0 +; GFX8-NEXT: v_add_f16_sdwa v0, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_e32 v1, v5, v2 +; GFX8-NEXT: v_or_b32_e32 v4, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB22_1 @@ -7440,30 +7421,30 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_mov_b32_e32 v8, s6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_add_f32_e32 v4, v4, v2 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v5 -; GFX7-NEXT: v_or_b32_e32 v5, v0, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX7-NEXT: v_or_b32_e32 v4, v6, v0 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v0 +; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[16:19], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v7 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB22_1 @@ -7486,31 +7467,31 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, s6 ; GFX6-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_mov_b32_e32 v8, s6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_add_f32_e32 v4, v4, v2 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v5 -; GFX6-NEXT: v_or_b32_e32 v5, v0, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX6-NEXT: v_or_b32_e32 v4, v6, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v5, v7, v0 +; GFX6-NEXT: v_mov_b32_e32 v8, v6 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[16:19], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v7 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB22_1 @@ -7552,23 +7533,25 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v1, s16 -; GFX11-NEXT: s_add_i32 s5, s16, 0x400 +; GFX11-NEXT: s_add_i32 s4, s16, 0x400 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v3, s4 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_add_f16 v1, v2, v0 -; GFX11-NEXT: v_dual_mov_b32 v5, s5 :: v_dual_mov_b32 v4, v2 +; GFX11-NEXT: v_mov_b32_e32 v5, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_mov_b32_e32 v3, v1 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc +; GFX11-NEXT: v_mov_b32_e32 v4, v1 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX11-NEXT: v_mov_b32_e32 v2, v4 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -7581,23 +7564,23 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s20 -; GFX10-NEXT: s_add_i32 s5, s20, 0x400 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 +; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_add_f16 v1, v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v5, s5 +; GFX10-NEXT: v_mov_b32_e32 v5, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX10-NEXT: v_mov_b32_e32 v4, v1 +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB23_1 @@ -7612,11 +7595,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_add_f16 v2, v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -7637,19 +7620,19 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_pk_add_f16 v1, v2, v0 -; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_mov_b32_e32 v5, s6 -; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v5, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v1 +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v2, v3 +; GFX908-NEXT: v_mov_b32_e32 v2, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB23_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7663,21 +7646,21 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_add_f16_sdwa v1, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_f16_e32 v3, v2, v0 -; GFX8-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX8-NEXT: v_add_f16_e32 v4, v2, v0 +; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 +; GFX8-NEXT: v_mov_b32_e32 v5, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v1 +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v2, v3 +; GFX8-NEXT: v_mov_b32_e32 v2, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB23_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7688,41 +7671,41 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0 ; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX7-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_mov_b32_e32 v7, s6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_add_f32_e32 v4, v4, v0 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_or_b32_e32 v4, v2, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v3, v5, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v0 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_or_b32_e32 v5, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX7-NEXT: v_or_b32_e32 v4, v6, v3 +; GFX7-NEXT: v_mov_b32_e32 v7, v5 ; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v7, s[16:19], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB23_1 @@ -7734,42 +7717,42 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 ; GFX6-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_mov_b32_e32 v7, s6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_add_f32_e32 v4, v4, v0 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_or_b32_e32 v4, v2, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX6-NEXT: v_or_b32_e32 v3, v5, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v0 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_or_b32_e32 v5, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX6-NEXT: v_or_b32_e32 v4, v6, v3 +; GFX6-NEXT: v_mov_b32_e32 v7, v5 ; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v7, s[16:19], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB23_1 @@ -7810,24 +7793,25 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, v0 +; GFX11-NEXT: s_add_i32 s4, s16, 0x400 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s4 ; GFX11-NEXT: v_mov_b32_e32 v0, s16 -; GFX11-NEXT: s_add_i32 s5, s16, 0x400 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 -; GFX11-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], 0 offen glc +; GFX11-NEXT: v_pk_add_f16 v4, v5, v2 +; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -7841,23 +7825,23 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 -; GFX10-NEXT: s_add_i32 s5, s20, 0x400 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 +; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: v_mov_b32_e32 v5, s5 +; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 -; GFX10-NEXT: v_mov_b32_e32 v1, v4 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX10-NEXT: v_pk_add_f16 v4, v5, v2 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB24_1 @@ -7873,12 +7857,12 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -7899,18 +7883,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 -; GFX908-NEXT: v_mov_b32_e32 v0, v3 -; GFX908-NEXT: v_mov_b32_e32 v5, s6 -; GFX908-NEXT: v_mov_b32_e32 v1, v4 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v5, v0 +; GFX908-NEXT: v_pk_add_f16 v4, v5, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB24_1 @@ -7926,20 +7910,20 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: v_add_f16_sdwa v0, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_f16_e32 v1, v4, v2 -; GFX8-NEXT: v_or_b32_e32 v3, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v3 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, v4 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v5, v0 +; GFX8-NEXT: v_add_f16_sdwa v0, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_e32 v1, v5, v2 +; GFX8-NEXT: v_or_b32_e32 v4, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB24_1 @@ -7962,30 +7946,30 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_mov_b32_e32 v8, s6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_add_f32_e32 v4, v4, v2 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v5 -; GFX7-NEXT: v_or_b32_e32 v5, v0, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX7-NEXT: v_or_b32_e32 v4, v6, v0 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v0 +; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[16:19], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v7 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB24_1 @@ -8008,31 +7992,31 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, s6 ; GFX6-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_mov_b32_e32 v8, s6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_add_f32_e32 v4, v4, v2 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v5 -; GFX6-NEXT: v_or_b32_e32 v5, v0, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX6-NEXT: v_or_b32_e32 v4, v6, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v5, v7, v0 +; GFX6-NEXT: v_mov_b32_e32 v8, v6 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[16:19], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v7 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB24_1 @@ -8074,23 +8058,25 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v1, s16 -; GFX11-NEXT: s_add_i32 s5, s16, 0x400 +; GFX11-NEXT: s_add_i32 s4, s16, 0x400 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v3, s4 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_add_f16 v1, v2, v0 -; GFX11-NEXT: v_dual_mov_b32 v5, s5 :: v_dual_mov_b32 v4, v2 +; GFX11-NEXT: v_mov_b32_e32 v5, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_mov_b32_e32 v3, v1 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc +; GFX11-NEXT: v_mov_b32_e32 v4, v1 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX11-NEXT: v_mov_b32_e32 v2, v4 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -8103,23 +8089,23 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s20 -; GFX10-NEXT: s_add_i32 s5, s20, 0x400 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 +; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_add_f16 v1, v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v5, s5 +; GFX10-NEXT: v_mov_b32_e32 v5, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX10-NEXT: v_mov_b32_e32 v4, v1 +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB25_1 @@ -8134,11 +8120,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem ; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_add_f16 v2, v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -8159,19 +8145,19 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem ; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_pk_add_f16 v1, v2, v0 -; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_mov_b32_e32 v5, s6 -; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v5, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v1 +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v2, v3 +; GFX908-NEXT: v_mov_b32_e32 v2, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB25_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8185,21 +8171,21 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem ; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_add_f16_sdwa v1, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_f16_e32 v3, v2, v0 -; GFX8-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX8-NEXT: v_add_f16_e32 v4, v2, v0 +; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 +; GFX8-NEXT: v_mov_b32_e32 v5, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v1 +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v2, v3 +; GFX8-NEXT: v_mov_b32_e32 v2, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB25_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8210,41 +8196,41 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0 ; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX7-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_mov_b32_e32 v7, s6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_add_f32_e32 v4, v4, v0 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_or_b32_e32 v4, v2, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v3, v5, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v0 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_or_b32_e32 v5, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX7-NEXT: v_or_b32_e32 v4, v6, v3 +; GFX7-NEXT: v_mov_b32_e32 v7, v5 ; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v7, s[16:19], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB25_1 @@ -8256,42 +8242,42 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 ; GFX6-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_mov_b32_e32 v7, s6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_add_f32_e32 v4, v4, v0 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_or_b32_e32 v4, v2, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX6-NEXT: v_or_b32_e32 v3, v5, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v0 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_or_b32_e32 v5, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX6-NEXT: v_or_b32_e32 v4, v6, v3 +; GFX6-NEXT: v_mov_b32_e32 v7, v5 ; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v7, s[16:19], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB25_1 @@ -8326,41 +8312,41 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 ; GFX940-NEXT: v_mov_b32_e32 v0, s16 ; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s8, s16, 0x400 +; GFX940-NEXT: s_add_i32 s4, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[6:7], 0 -; GFX940-NEXT: s_movk_i32 s9, 0x7fff -; GFX940-NEXT: s_mov_b32 s10, 0x7060302 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX940-NEXT: s_movk_i32 s8, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX940-NEXT: s_mov_b32 s9, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, s4 ; GFX940-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v0 -; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX940-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX940-NEXT: v_add_f32_e32 v1, v4, v1 -; GFX940-NEXT: v_add_f32_e32 v0, v6, v0 -; GFX940-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX940-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX940-NEXT: v_add3_u32 v4, v4, v1, s9 -; GFX940-NEXT: v_add3_u32 v7, v7, v0, s9 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, s8 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v0, v4, v6, s[4:5] -; GFX940-NEXT: v_perm_b32 v4, v1, v0, s10 -; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5] +; GFX940-NEXT: v_mov_b32_e32 v7, v0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 +; GFX940-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX940-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX940-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v1, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX940-NEXT: v_add3_u32 v5, v5, v0, s8 +; GFX940-NEXT: v_add3_u32 v8, v8, v1, s8 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 +; GFX940-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5] +; GFX940-NEXT: v_perm_b32 v6, v1, v0, s9 +; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[6:7] +; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 ; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX940-NEXT: s_cbranch_execnz .LBB26_1 @@ -8371,45 +8357,47 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, s16 -; GFX11-NEXT: s_add_i32 s6, s16, 0x400 +; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 +; GFX11-NEXT: s_add_i32 s4, s16, 0x400 ; GFX11-NEXT: s_mov_b32 s5, 0 +; GFX11-NEXT: v_mov_b32_e32 v4, s4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_and_b32 v1, 0xffff0000, v2 +; GFX11-NEXT: v_mov_b32_e32 v6, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX11-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX11-NEXT: v_add_f32_e32 v1, v5, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v5, v1, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX11-NEXT: v_bfe_u32 v7, v1, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: v_add3_u32 v5, v5, v1, 0x7fff +; GFX11-NEXT: v_add3_u32 v7, v7, v1, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_add_f32 v0, v3, v0 :: v_dual_cndmask_b32 v1, v5, v7 -; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; GFX11-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6 +; GFX11-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX11-NEXT: v_mov_b32_e32 v5, s6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff -; GFX11-NEXT: v_cndmask_b32_e64 v0, v3, v6, s4 +; GFX11-NEXT: v_add3_u32 v5, v5, v0, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v1, v0, 0x7060302 -; GFX11-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], 0 offen glc +; GFX11-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 +; GFX11-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -8422,41 +8410,41 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 -; GFX10-NEXT: s_add_i32 s6, s20, 0x400 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 ; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, s4 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX10-NEXT: v_mov_b32_e32 v6, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX10-NEXT: v_add_f32_e32 v0, v3, v0 -; GFX10-NEXT: v_add_f32_e32 v1, v5, v1 -; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v5, v1, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX10-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v1, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v1 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v5, v5, v1, 0x7fff +; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v1, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v3, v6, s4 -; GFX10-NEXT: v_mov_b32_e32 v5, s6 -; GFX10-NEXT: v_perm_b32 v3, v1, v0, 0x7060302 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 -; GFX10-NEXT: v_mov_b32_e32 v1, v4 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 +; GFX10-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 +; GFX10-NEXT: v_mov_b32_e32 v0, v5 +; GFX10-NEXT: v_mov_b32_e32 v1, v6 +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB26_1 @@ -8467,40 +8455,40 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s8, s20, 0x400 +; GFX90A-NEXT: s_add_i32 s4, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: s_movk_i32 s9, 0x7fff -; GFX90A-NEXT: s_mov_b32 s10, 0x7060302 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, s4 ; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX90A-NEXT: v_add_f32_e32 v1, v4, v1 -; GFX90A-NEXT: v_add_f32_e32 v0, v6, v0 -; GFX90A-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX90A-NEXT: v_add3_u32 v4, v4, v1, s9 -; GFX90A-NEXT: v_add3_u32 v7, v7, v0, s9 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v4, v6, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc -; GFX90A-NEXT: v_perm_b32 v4, v1, v0, s10 -; GFX90A-NEXT: v_mov_b32_e32 v3, s8 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 +; GFX90A-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX90A-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX90A-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v1, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX90A-NEXT: v_add3_u32 v5, v5, v0, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v1, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v6, v1, v0, s9 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB26_1 @@ -8511,41 +8499,41 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v1, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s8, s20, 0x400 +; GFX908-NEXT: s_add_i32 s4, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: s_movk_i32 s9, 0x7fff -; GFX908-NEXT: s_mov_b32 s10, 0x7060302 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX908-NEXT: s_mov_b32 s9, 0x7060302 +; GFX908-NEXT: v_mov_b32_e32 v4, s4 ; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX908-NEXT: v_add_f32_e32 v1, v3, v1 -; GFX908-NEXT: v_add_f32_e32 v0, v6, v0 -; GFX908-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX908-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX908-NEXT: v_add3_u32 v3, v3, v1, s9 -; GFX908-NEXT: v_add3_u32 v7, v7, v0, s9 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v3, v6, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc -; GFX908-NEXT: v_perm_b32 v3, v1, v0, s10 -; GFX908-NEXT: v_mov_b32_e32 v0, v3 -; GFX908-NEXT: v_mov_b32_e32 v5, s8 -; GFX908-NEXT: v_mov_b32_e32 v1, v4 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v6, v0 +; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX908-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX908-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX908-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v1, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX908-NEXT: v_add3_u32 v5, v5, v0, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v1, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX908-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v5, v1, v0, s9 +; GFX908-NEXT: v_mov_b32_e32 v0, v5 +; GFX908-NEXT: v_mov_b32_e32 v1, v6 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB26_1 @@ -8556,42 +8544,42 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s20 ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s8, s20, 0x400 +; GFX8-NEXT: s_add_i32 s4, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX8-NEXT: v_add_f32_e32 v1, v3, v1 -; GFX8-NEXT: v_add_f32_e32 v0, v6, v0 -; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v1 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v6, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX8-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v1, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v0 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_alignbit_b32 v3, v1, v0, 16 -; GFX8-NEXT: v_mov_b32_e32 v0, v3 -; GFX8-NEXT: v_mov_b32_e32 v5, s8 -; GFX8-NEXT: v_mov_b32_e32 v1, v4 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX8-NEXT: v_alignbit_b32 v5, v1, v0, 16 +; GFX8-NEXT: v_mov_b32_e32 v0, v5 +; GFX8-NEXT: v_mov_b32_e32 v1, v6 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB26_1 @@ -8603,38 +8591,38 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX7-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v2 ; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; GFX7-NEXT: v_alignbit_b32 v0, v0, v4, 16 -; GFX7-NEXT: v_mov_b32_e32 v5, v1 -; GFX7-NEXT: v_mov_b32_e32 v6, s6 -; GFX7-NEXT: v_mov_b32_e32 v4, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v6 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v5, 16 +; GFX7-NEXT: v_mov_b32_e32 v6, v1 +; GFX7-NEXT: v_mov_b32_e32 v5, v0 +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB26_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8645,39 +8633,39 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v1 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, s6 ; GFX6-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX6-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v3 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v2 ; GFX6-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; GFX6-NEXT: v_alignbit_b32 v0, v0, v4, 16 -; GFX6-NEXT: v_mov_b32_e32 v5, v1 -; GFX6-NEXT: v_mov_b32_e32 v6, s6 -; GFX6-NEXT: v_mov_b32_e32 v4, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v6 +; GFX6-NEXT: v_alignbit_b32 v0, v0, v5, 16 +; GFX6-NEXT: v_mov_b32_e32 v6, v1 +; GFX6-NEXT: v_mov_b32_e32 v5, v0 +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB26_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8708,40 +8696,40 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v1, s16 -; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s8, s16, 0x400 +; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 +; GFX940-NEXT: s_add_i32 s4, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[6:7], 0 -; GFX940-NEXT: s_movk_i32 s9, 0x7fff -; GFX940-NEXT: s_mov_b32 s10, 0x7060302 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX940-NEXT: s_movk_i32 s8, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX940-NEXT: s_mov_b32 s9, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, s4 ; GFX940-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX940-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX940-NEXT: v_add_f32_e32 v1, v2, v1 -; GFX940-NEXT: v_add_f32_e32 v2, v5, v4 -; GFX940-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_add3_u32 v4, v4, v1, s9 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s9 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX940-NEXT: v_mov_b32_e32 v6, s8 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] -; GFX940-NEXT: v_perm_b32 v2, v2, v1, s10 -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX940-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX940-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX940-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v0, s8 +; GFX940-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[0:3], 0 offen sc0 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX940-NEXT: v_perm_b32 v0, v5, v0, s9 +; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[0:1] +; GFX940-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 ; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX940-NEXT: s_cbranch_execnz .LBB27_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8751,45 +8739,43 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s16 -; GFX11-NEXT: s_add_i32 s6, s16, 0x400 +; GFX11-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0 +; GFX11-NEXT: s_add_i32 s4, s16, 0x400 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 +; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: s_mov_b32 s5, 0 -; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_add_f32 v5, v5, v3 :: v_dual_add_f32 v0, v0, v2 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v1, v3, v1 -; GFX11-NEXT: v_add_f32_e32 v3, v5, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v1 -; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-NEXT: v_add3_u32 v4, v4, v1, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s4, v1, v1 -; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v1, v4, v6, s4 -; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo -; GFX11-NEXT: v_mov_b32_e32 v5, s6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v1, v3, v1, 0x7060302 -; GFX11-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 +; GFX11-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, v5 ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -8803,39 +8789,39 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s20 -; GFX10-NEXT: s_add_i32 s6, s20, 0x400 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX10-NEXT: v_mov_b32_e32 v4, s4 +; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f32_e32 v1, v3, v1 -; GFX10-NEXT: v_add_f32_e32 v3, v5, v4 -; GFX10-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1 -; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX10-NEXT: v_add3_u32 v4, v4, v1, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v1, v1 -; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v1, v4, v6, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v5, s6 -; GFX10-NEXT: v_perm_b32 v1, v3, v1, 0x7060302 -; GFX10-NEXT: v_mov_b32_e32 v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX10-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 +; GFX10-NEXT: v_mov_b32_e32 v6, v1 +; GFX10-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB27_1 @@ -8847,39 +8833,39 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, s20 -; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s8, s20, 0x400 +; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s4, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: s_movk_i32 s9, 0x7fff -; GFX90A-NEXT: s_mov_b32 s10, 0x7060302 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, s4 ; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX90A-NEXT: v_add_f32_e32 v1, v2, v1 -; GFX90A-NEXT: v_add_f32_e32 v2, v5, v4 -; GFX90A-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_add3_u32 v4, v4, v1, s9 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s9 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX90A-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc -; GFX90A-NEXT: v_perm_b32 v2, v2, v1, s10 -; GFX90A-NEXT: v_mov_b32_e32 v6, s8 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX90A-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX90A-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s9 +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB27_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8890,40 +8876,40 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, s20 -; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s8, s20, 0x400 +; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s4, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: s_movk_i32 s9, 0x7fff -; GFX908-NEXT: s_mov_b32 s10, 0x7060302 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX908-NEXT: s_mov_b32 s9, 0x7060302 +; GFX908-NEXT: v_mov_b32_e32 v4, s4 ; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX908-NEXT: v_add_f32_e32 v1, v3, v1 -; GFX908-NEXT: v_add_f32_e32 v3, v5, v4 -; GFX908-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX908-NEXT: v_add3_u32 v4, v4, v1, s9 -; GFX908-NEXT: v_add3_u32 v7, v7, v3, s9 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX908-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; GFX908-NEXT: v_perm_b32 v1, v3, v1, s10 -; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_mov_b32_e32 v6, s8 -; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v6, s[16:19], 0 offen glc +; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX908-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX908-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX908-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v0, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9 +; GFX908-NEXT: v_mov_b32_e32 v6, v1 +; GFX908-NEXT: v_mov_b32_e32 v5, v0 +; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v2, v3 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB27_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8934,41 +8920,41 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s20 -; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s8, s20, 0x400 +; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s4, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: v_add_f32_e32 v1, v3, v1 -; GFX8-NEXT: v_add_f32_e32 v3, v5, v4 -; GFX8-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_alignbit_b32 v1, v3, v1, 16 -; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_mov_b32_e32 v6, s8 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v6, s[16:19], 0 offen glc +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX8-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX8-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 +; GFX8-NEXT: v_mov_b32_e32 v6, v1 +; GFX8-NEXT: v_mov_b32_e32 v5, v0 +; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v2, v3 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB27_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8979,38 +8965,38 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX7-NEXT: v_add_f32_e32 v4, v4, v0 -; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v5 -; GFX7-NEXT: v_alignbit_b32 v2, v2, v4, 16 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v1 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v0 +; GFX7-NEXT: v_alignbit_b32 v4, v3, v4, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_alignbit_b32 v3, v3, v5, 16 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v6, s6 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB27_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9021,39 +9007,39 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 ; GFX6-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX6-NEXT: v_add_f32_e32 v4, v4, v0 -; GFX6-NEXT: v_alignbit_b32 v3, v2, v3, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v5 -; GFX6-NEXT: v_alignbit_b32 v2, v2, v4, 16 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v1 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v0 +; GFX6-NEXT: v_alignbit_b32 v4, v3, v4, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_alignbit_b32 v3, v3, v5, 16 +; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: v_mov_b32_e32 v6, s6 -; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB27_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9106,7 +9092,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_u32_e32 v10, 0x400, v4 +; GFX940-NEXT: v_add_u32_e32 v8, 0x400, v4 ; GFX940-NEXT: s_mov_b64 s[2:3], exec ; GFX940-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 @@ -9118,40 +9104,40 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] ; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: buffer_load_dword v9, v4, s[4:7], 0 offen offset:1024 +; GFX940-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024 ; GFX940-NEXT: ; implicit-def: $vgpr4 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB28_1 ; GFX940-NEXT: ; %bb.2: ; GFX940-NEXT: s_mov_b64 exec, s[2:3] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v9, 16, v5 ; GFX940-NEXT: s_movk_i32 s10, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 ; GFX940-NEXT: s_mov_b32 s11, 0x7060302 ; GFX940-NEXT: .LBB28_3: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Loop Header: Depth=1 ; GFX940-NEXT: ; Child Loop BB28_4 Depth 2 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v9 -; GFX940-NEXT: v_add_f32_e32 v4, v6, v4 -; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX940-NEXT: v_add3_u32 v6, v6, v4, s10 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v7 +; GFX940-NEXT: v_add_f32_e32 v4, v4, v9 +; GFX940-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX940-NEXT: v_add3_u32 v5, v5, v4, s10 +; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX940-NEXT: s_mov_b64 s[8:9], exec ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v9 -; GFX940-NEXT: v_add_f32_e32 v6, v7, v6 -; GFX940-NEXT: v_bfe_u32 v7, v6, 16, 1 -; GFX940-NEXT: v_add3_u32 v7, v7, v6, s10 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v6 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX940-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX940-NEXT: v_add_f32_e32 v5, v5, v10 +; GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX940-NEXT: v_add3_u32 v6, v6, v5, s10 +; GFX940-NEXT: v_or_b32_e32 v11, 0x400000, v5 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc -; GFX940-NEXT: v_perm_b32 v8, v6, v4, s11 -; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[8:9] +; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc +; GFX940-NEXT: v_perm_b32 v6, v5, v4, s11 +; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7] ; GFX940-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1 ; GFX940-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 @@ -9164,27 +9150,27 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_atomic_cmpswap v[6:7], v10, s[4:7], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB28_4 ; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1 ; GFX940-NEXT: s_mov_b64 exec, s[8:9] ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v9, v6 +; GFX940-NEXT: v_mov_b32_e32 v7, v4 ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB28_3 ; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x400, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_mov_b32 s2, exec_lo ; GFX11-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1 @@ -9198,41 +9184,42 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-NEXT: buffer_load_b32 v8, v4, s[4:7], 0 offen offset:1024 +; GFX11-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024 ; GFX11-NEXT: ; implicit-def: $vgpr4 ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB28_1 ; GFX11-NEXT: ; %bb.2: ; GFX11-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v5 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 +; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB28_3: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-NEXT: ; Child Loop BB28_4 Depth 2 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v8 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX11-NEXT: s_mov_b32 s2, exec_lo ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_add_f32_e32 v4, v6, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v6, v10, v7 -; GFX11-NEXT: v_bfe_u32 v7, v4, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_bfe_u32 v10, v6, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_add_f32 v5, v5, v9 :: v_dual_add_f32 v4, v4, v8 +; GFX11-NEXT: v_bfe_u32 v11, v5, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v10, v4, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v4 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v6 -; GFX11-NEXT: v_add3_u32 v7, v7, v4, 0x7fff -; GFX11-NEXT: v_add3_u32 v10, v10, v6, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e32 v4, v7, v11, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: v_cndmask_b32_e32 v6, v10, v12, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v5 +; GFX11-NEXT: v_add3_u32 v11, v11, v5, 0x7fff +; GFX11-NEXT: v_add3_u32 v10, v10, v4, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v7, v6, v4, 0x7060302 -; GFX11-NEXT: v_mov_b32_e32 v6, v7 -; GFX11-NEXT: v_mov_b32_e32 v7, v8 +; GFX11-NEXT: v_perm_b32 v5, v5, v4, 0x7060302 +; GFX11-NEXT: v_mov_b32_e32 v4, v5 +; GFX11-NEXT: v_mov_b32_e32 v5, v6 ; GFX11-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1 ; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 @@ -9246,14 +9233,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], 0 offen glc +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB28_4 ; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1 ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 -; GFX11-NEXT: v_mov_b32_e32 v8, v6 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 +; GFX11-NEXT: v_mov_b32_e32 v6, v4 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 @@ -9261,14 +9248,15 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_cbranch_execnz .LBB28_3 ; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, v6 +; GFX11-NEXT: v_mov_b32_e32 v0, v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_nc_u32_e32 v9, 0x400, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1 @@ -9280,38 +9268,38 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 -; GFX10-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 ; GFX10-NEXT: ; implicit-def: $vgpr4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB28_1 ; GFX10-NEXT: ; %bb.2: ; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v5 +; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 ; GFX10-NEXT: .LBB28_3: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Loop Header: Depth=1 ; GFX10-NEXT: ; Child Loop BB28_4 Depth 2 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v8 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f32_e32 v4, v6, v4 -; GFX10-NEXT: v_add_f32_e32 v6, v10, v7 -; GFX10-NEXT: v_bfe_u32 v7, v4, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v4 -; GFX10-NEXT: v_bfe_u32 v10, v6, 16, 1 +; GFX10-NEXT: v_add_f32_e32 v4, v4, v8 +; GFX10-NEXT: v_add_f32_e32 v5, v5, v9 +; GFX10-NEXT: v_bfe_u32 v10, v4, 16, 1 +; GFX10-NEXT: v_bfe_u32 v11, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v4 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v6 -; GFX10-NEXT: v_add3_u32 v7, v7, v4, 0x7fff -; GFX10-NEXT: v_add3_u32 v10, v10, v6, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v4, v7, v11, vcc_lo -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v10, v12, vcc_lo -; GFX10-NEXT: v_perm_b32 v7, v6, v4, 0x7060302 -; GFX10-NEXT: v_mov_b32_e32 v6, v7 -; GFX10-NEXT: v_mov_b32_e32 v7, v8 +; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v5 +; GFX10-NEXT: v_add3_u32 v10, v10, v4, 0x7fff +; GFX10-NEXT: v_add3_u32 v11, v11, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo +; GFX10-NEXT: v_perm_b32 v5, v5, v4, 0x7060302 +; GFX10-NEXT: v_mov_b32_e32 v4, v5 +; GFX10-NEXT: v_mov_b32_e32 v5, v6 ; GFX10-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 @@ -9323,15 +9311,15 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB28_4 ; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1 ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 -; GFX10-NEXT: v_mov_b32_e32 v8, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 +; GFX10-NEXT: v_mov_b32_e32 v6, v4 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 @@ -9340,13 +9328,13 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX10-NEXT: s_cbranch_execnz .LBB28_3 ; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: v_mov_b32_e32 v0, v6 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_u32_e32 v10, 0x400, v4 +; GFX90A-NEXT: v_add_u32_e32 v8, 0x400, v4 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec ; GFX90A-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -9358,38 +9346,38 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_load_dword v9, v4, s[8:11], 0 offen offset:1024 +; GFX90A-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 ; GFX90A-NEXT: ; implicit-def: $vgpr4 ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB28_1 ; GFX90A-NEXT: ; %bb.2: ; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v9, 16, v5 ; GFX90A-NEXT: s_movk_i32 s14, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 ; GFX90A-NEXT: s_mov_b32 s15, 0x7060302 ; GFX90A-NEXT: .LBB28_3: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 ; GFX90A-NEXT: ; Child Loop BB28_4 Depth 2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v9 -; GFX90A-NEXT: v_add_f32_e32 v4, v6, v4 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s14 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v9 -; GFX90A-NEXT: v_add_f32_e32 v6, v7, v6 -; GFX90A-NEXT: v_bfe_u32 v7, v6, 16, 1 -; GFX90A-NEXT: v_add3_u32 v7, v7, v6, s14 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc -; GFX90A-NEXT: v_perm_b32 v8, v6, v4, s15 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v7 +; GFX90A-NEXT: v_add_f32_e32 v4, v4, v9 +; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s14 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX90A-NEXT: v_add_f32_e32 v5, v5, v10 +; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s14 +; GFX90A-NEXT: v_or_b32_e32 v11, 0x400000, v5 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc +; GFX90A-NEXT: v_perm_b32 v6, v5, v4, s15 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[8:9], v[8:9] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -9401,27 +9389,27 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v10, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB28_4 ; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1 ; GFX90A-NEXT: s_mov_b64 exec, s[12:13] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v9, v6 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB28_3 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_u32_e32 v9, 0x400, v4 +; GFX908-NEXT: v_add_u32_e32 v7, 0x400, v4 ; GFX908-NEXT: s_mov_b64 s[6:7], exec ; GFX908-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 @@ -9433,39 +9421,39 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 +; GFX908-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 ; GFX908-NEXT: ; implicit-def: $vgpr4 ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB28_1 ; GFX908-NEXT: ; %bb.2: ; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v8, 16, v5 ; GFX908-NEXT: s_movk_i32 s14, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 ; GFX908-NEXT: s_mov_b32 s15, 0x7060302 ; GFX908-NEXT: .LBB28_3: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Loop Header: Depth=1 ; GFX908-NEXT: ; Child Loop BB28_4 Depth 2 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; GFX908-NEXT: v_add_f32_e32 v4, v6, v4 -; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX908-NEXT: v_add3_u32 v6, v6, v4, s14 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX908-NEXT: v_add_f32_e32 v4, v4, v8 +; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX908-NEXT: v_add3_u32 v5, v5, v4, s14 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v4 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX908-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v8 -; GFX908-NEXT: v_add_f32_e32 v6, v7, v6 -; GFX908-NEXT: v_bfe_u32 v7, v6, 16, 1 -; GFX908-NEXT: v_add3_u32 v7, v7, v6, s14 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX908-NEXT: v_cndmask_b32_e32 v6, v7, v10, vcc -; GFX908-NEXT: v_perm_b32 v7, v6, v4, s15 -; GFX908-NEXT: v_mov_b32_e32 v6, v7 +; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v10, vcc +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_add_f32_e32 v5, v5, v9 +; GFX908-NEXT: v_bfe_u32 v10, v5, 16, 1 +; GFX908-NEXT: v_add3_u32 v10, v10, v5, s14 +; GFX908-NEXT: v_or_b32_e32 v11, 0x400000, v5 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v4, s15 +; GFX908-NEXT: v_mov_b32_e32 v4, v5 ; GFX908-NEXT: s_mov_b64 s[12:13], exec -; GFX908-NEXT: v_mov_b32_e32 v7, v8 +; GFX908-NEXT: v_mov_b32_e32 v5, v6 ; GFX908-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 @@ -9477,27 +9465,27 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB28_4 ; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1 ; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v8, v6 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB28_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v0, v6 +; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x400, v4 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x400, v4 ; GFX8-NEXT: s_mov_b64 s[6:7], exec ; GFX8-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 @@ -9509,40 +9497,40 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 +; GFX8-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 ; GFX8-NEXT: ; implicit-def: $vgpr4 ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB28_1 ; GFX8-NEXT: ; %bb.2: ; GFX8-NEXT: s_mov_b64 exec, s[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v5 +; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 ; GFX8-NEXT: .LBB28_3: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Loop Header: Depth=1 ; GFX8-NEXT: ; Child Loop BB28_4 Depth 2 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; GFX8-NEXT: v_add_f32_e32 v4, v6, v4 -; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX8-NEXT: v_add_f32_e32 v4, v4, v8 +; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v4 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v8 -; GFX8-NEXT: v_add_f32_e32 v6, v7, v6 -; GFX8-NEXT: v_bfe_u32 v7, v6, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v7, v10, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v7, v6, v4, 16 -; GFX8-NEXT: v_mov_b32_e32 v6, v7 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v10, vcc +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_add_f32_e32 v5, v5, v9 +; GFX8-NEXT: v_bfe_u32 v10, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v5 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, 0x7fff, v10 +; GFX8-NEXT: v_or_b32_e32 v11, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v5, v5, v4, 16 +; GFX8-NEXT: v_mov_b32_e32 v4, v5 ; GFX8-NEXT: s_mov_b64 s[12:13], exec -; GFX8-NEXT: v_mov_b32_e32 v7, v8 +; GFX8-NEXT: v_mov_b32_e32 v5, v6 ; GFX8-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 @@ -9554,21 +9542,21 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB28_4 ; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1 ; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v8, v6 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB28_3 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v0, v6 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -9742,41 +9730,41 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 ; GFX940-NEXT: v_mov_b32_e32 v0, s16 ; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s8, s16, 0x400 +; GFX940-NEXT: s_add_i32 s4, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[6:7], 0 -; GFX940-NEXT: s_movk_i32 s9, 0x7fff -; GFX940-NEXT: s_mov_b32 s10, 0x7060302 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX940-NEXT: s_movk_i32 s8, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX940-NEXT: s_mov_b32 s9, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, s4 ; GFX940-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v0 -; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX940-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX940-NEXT: v_add_f32_e32 v1, v4, v1 -; GFX940-NEXT: v_add_f32_e32 v0, v6, v0 -; GFX940-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX940-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX940-NEXT: v_add3_u32 v4, v4, v1, s9 -; GFX940-NEXT: v_add3_u32 v7, v7, v0, s9 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, s8 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v0, v4, v6, s[4:5] -; GFX940-NEXT: v_perm_b32 v4, v1, v0, s10 -; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5] +; GFX940-NEXT: v_mov_b32_e32 v7, v0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 +; GFX940-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX940-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX940-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v1, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX940-NEXT: v_add3_u32 v5, v5, v0, s8 +; GFX940-NEXT: v_add3_u32 v8, v8, v1, s8 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 +; GFX940-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5] +; GFX940-NEXT: v_perm_b32 v6, v1, v0, s9 +; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[6:7] +; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 ; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX940-NEXT: s_cbranch_execnz .LBB29_1 @@ -9787,45 +9775,47 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, s16 -; GFX11-NEXT: s_add_i32 s6, s16, 0x400 +; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 +; GFX11-NEXT: s_add_i32 s4, s16, 0x400 ; GFX11-NEXT: s_mov_b32 s5, 0 +; GFX11-NEXT: v_mov_b32_e32 v4, s4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_and_b32 v1, 0xffff0000, v2 +; GFX11-NEXT: v_mov_b32_e32 v6, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX11-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX11-NEXT: v_add_f32_e32 v1, v5, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v5, v1, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX11-NEXT: v_bfe_u32 v7, v1, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: v_add3_u32 v5, v5, v1, 0x7fff +; GFX11-NEXT: v_add3_u32 v7, v7, v1, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_add_f32 v0, v3, v0 :: v_dual_cndmask_b32 v1, v5, v7 -; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; GFX11-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6 +; GFX11-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX11-NEXT: v_mov_b32_e32 v5, s6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff -; GFX11-NEXT: v_cndmask_b32_e64 v0, v3, v6, s4 +; GFX11-NEXT: v_add3_u32 v5, v5, v0, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v1, v0, 0x7060302 -; GFX11-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], 0 offen glc +; GFX11-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 +; GFX11-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -9838,41 +9828,41 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 -; GFX10-NEXT: s_add_i32 s6, s20, 0x400 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 ; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, s4 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX10-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX10-NEXT: v_mov_b32_e32 v6, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX10-NEXT: v_add_f32_e32 v0, v3, v0 -; GFX10-NEXT: v_add_f32_e32 v1, v5, v1 -; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v5, v1, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX10-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v1, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v1 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v5, v5, v1, 0x7fff +; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v1, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v3, v6, s4 -; GFX10-NEXT: v_mov_b32_e32 v5, s6 -; GFX10-NEXT: v_perm_b32 v3, v1, v0, 0x7060302 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 -; GFX10-NEXT: v_mov_b32_e32 v1, v4 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 +; GFX10-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 +; GFX10-NEXT: v_mov_b32_e32 v0, v5 +; GFX10-NEXT: v_mov_b32_e32 v1, v6 +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB29_1 @@ -9883,40 +9873,40 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s8, s20, 0x400 +; GFX90A-NEXT: s_add_i32 s4, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: s_movk_i32 s9, 0x7fff -; GFX90A-NEXT: s_mov_b32 s10, 0x7060302 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, s4 ; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX90A-NEXT: v_add_f32_e32 v1, v4, v1 -; GFX90A-NEXT: v_add_f32_e32 v0, v6, v0 -; GFX90A-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX90A-NEXT: v_add3_u32 v4, v4, v1, s9 -; GFX90A-NEXT: v_add3_u32 v7, v7, v0, s9 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v4, v6, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc -; GFX90A-NEXT: v_perm_b32 v4, v1, v0, s10 -; GFX90A-NEXT: v_mov_b32_e32 v3, s8 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 +; GFX90A-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX90A-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX90A-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v1, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX90A-NEXT: v_add3_u32 v5, v5, v0, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v1, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v6, v1, v0, s9 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB29_1 @@ -9927,41 +9917,41 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v1, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s8, s20, 0x400 +; GFX908-NEXT: s_add_i32 s4, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: s_movk_i32 s9, 0x7fff -; GFX908-NEXT: s_mov_b32 s10, 0x7060302 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX908-NEXT: s_mov_b32 s9, 0x7060302 +; GFX908-NEXT: v_mov_b32_e32 v4, s4 ; GFX908-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX908-NEXT: v_add_f32_e32 v1, v3, v1 -; GFX908-NEXT: v_add_f32_e32 v0, v6, v0 -; GFX908-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX908-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX908-NEXT: v_add3_u32 v3, v3, v1, s9 -; GFX908-NEXT: v_add3_u32 v7, v7, v0, s9 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v3, v6, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc -; GFX908-NEXT: v_perm_b32 v3, v1, v0, s10 -; GFX908-NEXT: v_mov_b32_e32 v0, v3 -; GFX908-NEXT: v_mov_b32_e32 v5, s8 -; GFX908-NEXT: v_mov_b32_e32 v1, v4 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v6, v0 +; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX908-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX908-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX908-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v1, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX908-NEXT: v_add3_u32 v5, v5, v0, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v1, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX908-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v5, v1, v0, s9 +; GFX908-NEXT: v_mov_b32_e32 v0, v5 +; GFX908-NEXT: v_mov_b32_e32 v1, v6 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB29_1 @@ -9972,42 +9962,42 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s20 ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s8, s20, 0x400 +; GFX8-NEXT: s_add_i32 s4, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX8-NEXT: v_add_f32_e32 v1, v3, v1 -; GFX8-NEXT: v_add_f32_e32 v0, v6, v0 -; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v1 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v6, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX8-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v1, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v0 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_alignbit_b32 v3, v1, v0, 16 -; GFX8-NEXT: v_mov_b32_e32 v0, v3 -; GFX8-NEXT: v_mov_b32_e32 v5, s8 -; GFX8-NEXT: v_mov_b32_e32 v1, v4 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX8-NEXT: v_alignbit_b32 v5, v1, v0, 16 +; GFX8-NEXT: v_mov_b32_e32 v0, v5 +; GFX8-NEXT: v_mov_b32_e32 v1, v6 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB29_1 @@ -10019,38 +10009,38 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX7-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v2 ; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; GFX7-NEXT: v_alignbit_b32 v0, v0, v4, 16 -; GFX7-NEXT: v_mov_b32_e32 v5, v1 -; GFX7-NEXT: v_mov_b32_e32 v6, s6 -; GFX7-NEXT: v_mov_b32_e32 v4, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v6 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v5, 16 +; GFX7-NEXT: v_mov_b32_e32 v6, v1 +; GFX7-NEXT: v_mov_b32_e32 v5, v0 +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB29_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -10061,39 +10051,39 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v1 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, s6 ; GFX6-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX6-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v3 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v2 ; GFX6-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; GFX6-NEXT: v_alignbit_b32 v0, v0, v4, 16 -; GFX6-NEXT: v_mov_b32_e32 v5, v1 -; GFX6-NEXT: v_mov_b32_e32 v6, s6 -; GFX6-NEXT: v_mov_b32_e32 v4, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v6 +; GFX6-NEXT: v_alignbit_b32 v0, v0, v5, 16 +; GFX6-NEXT: v_mov_b32_e32 v6, v1 +; GFX6-NEXT: v_mov_b32_e32 v5, v0 +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB29_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -10124,40 +10114,40 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v1, s16 -; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s8, s16, 0x400 +; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 +; GFX940-NEXT: s_add_i32 s4, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[6:7], 0 -; GFX940-NEXT: s_movk_i32 s9, 0x7fff -; GFX940-NEXT: s_mov_b32 s10, 0x7060302 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX940-NEXT: s_movk_i32 s8, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX940-NEXT: s_mov_b32 s9, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, s4 ; GFX940-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX940-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX940-NEXT: v_add_f32_e32 v1, v2, v1 -; GFX940-NEXT: v_add_f32_e32 v2, v5, v4 -; GFX940-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_add3_u32 v4, v4, v1, s9 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s9 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX940-NEXT: v_mov_b32_e32 v6, s8 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] -; GFX940-NEXT: v_perm_b32 v2, v2, v1, s10 -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX940-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX940-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX940-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v0, s8 +; GFX940-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[0:3], 0 offen sc0 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX940-NEXT: v_perm_b32 v0, v5, v0, s9 +; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[0:1] +; GFX940-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 ; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX940-NEXT: s_cbranch_execnz .LBB30_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -10167,45 +10157,43 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s16 -; GFX11-NEXT: s_add_i32 s6, s16, 0x400 +; GFX11-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0 +; GFX11-NEXT: s_add_i32 s4, s16, 0x400 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 +; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: s_mov_b32 s5, 0 -; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_add_f32 v5, v5, v3 :: v_dual_add_f32 v0, v0, v2 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v1, v3, v1 -; GFX11-NEXT: v_add_f32_e32 v3, v5, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v1 -; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-NEXT: v_add3_u32 v4, v4, v1, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s4, v1, v1 -; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v1, v4, v6, s4 -; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo -; GFX11-NEXT: v_mov_b32_e32 v5, s6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v1, v3, v1, 0x7060302 -; GFX11-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 +; GFX11-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, v5 ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -10219,39 +10207,39 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s20 -; GFX10-NEXT: s_add_i32 s6, s20, 0x400 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX10-NEXT: v_mov_b32_e32 v4, s4 +; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f32_e32 v1, v3, v1 -; GFX10-NEXT: v_add_f32_e32 v3, v5, v4 -; GFX10-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1 -; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX10-NEXT: v_add3_u32 v4, v4, v1, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v1, v1 -; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v1, v4, v6, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v5, s6 -; GFX10-NEXT: v_perm_b32 v1, v3, v1, 0x7060302 -; GFX10-NEXT: v_mov_b32_e32 v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX10-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 +; GFX10-NEXT: v_mov_b32_e32 v6, v1 +; GFX10-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB30_1 @@ -10263,39 +10251,39 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, s20 -; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s8, s20, 0x400 +; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s4, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: s_movk_i32 s9, 0x7fff -; GFX90A-NEXT: s_mov_b32 s10, 0x7060302 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, s4 ; GFX90A-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX90A-NEXT: v_add_f32_e32 v1, v2, v1 -; GFX90A-NEXT: v_add_f32_e32 v2, v5, v4 -; GFX90A-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_add3_u32 v4, v4, v1, s9 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s9 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX90A-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc -; GFX90A-NEXT: v_perm_b32 v2, v2, v1, s10 -; GFX90A-NEXT: v_mov_b32_e32 v6, s8 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX90A-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX90A-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s9 +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB30_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -10306,40 +10294,40 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, s20 -; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s8, s20, 0x400 +; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s4, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: s_movk_i32 s9, 0x7fff -; GFX908-NEXT: s_mov_b32 s10, 0x7060302 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX908-NEXT: s_mov_b32 s9, 0x7060302 +; GFX908-NEXT: v_mov_b32_e32 v4, s4 ; GFX908-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX908-NEXT: v_add_f32_e32 v1, v3, v1 -; GFX908-NEXT: v_add_f32_e32 v3, v5, v4 -; GFX908-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX908-NEXT: v_add3_u32 v4, v4, v1, s9 -; GFX908-NEXT: v_add3_u32 v7, v7, v3, s9 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX908-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; GFX908-NEXT: v_perm_b32 v1, v3, v1, s10 -; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_mov_b32_e32 v6, s8 -; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v6, s[16:19], 0 offen glc +; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX908-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX908-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX908-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v0, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9 +; GFX908-NEXT: v_mov_b32_e32 v6, v1 +; GFX908-NEXT: v_mov_b32_e32 v5, v0 +; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v2, v3 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB30_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -10350,41 +10338,41 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s20 -; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s8, s20, 0x400 +; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s4, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: v_add_f32_e32 v1, v3, v1 -; GFX8-NEXT: v_add_f32_e32 v3, v5, v4 -; GFX8-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_alignbit_b32 v1, v3, v1, 16 -; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_mov_b32_e32 v6, s8 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v6, s[16:19], 0 offen glc +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX8-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX8-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 +; GFX8-NEXT: v_mov_b32_e32 v6, v1 +; GFX8-NEXT: v_mov_b32_e32 v5, v0 +; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v2, v3 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB30_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -10395,38 +10383,38 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX7-NEXT: v_add_f32_e32 v4, v4, v0 -; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v5 -; GFX7-NEXT: v_alignbit_b32 v2, v2, v4, 16 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v1 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v0 +; GFX7-NEXT: v_alignbit_b32 v4, v3, v4, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_alignbit_b32 v3, v3, v5, 16 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v6, s6 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB30_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -10437,39 +10425,39 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 ; GFX6-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX6-NEXT: v_add_f32_e32 v4, v4, v0 -; GFX6-NEXT: v_alignbit_b32 v3, v2, v3, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v5 -; GFX6-NEXT: v_alignbit_b32 v2, v2, v4, 16 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v1 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v0 +; GFX6-NEXT: v_alignbit_b32 v4, v3, v4, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_alignbit_b32 v3, v3, v5, 16 +; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: v_mov_b32_e32 v6, s6 -; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB30_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -10499,41 +10487,41 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 ; GFX940-NEXT: v_mov_b32_e32 v0, s16 ; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s8, s16, 0x400 +; GFX940-NEXT: s_add_i32 s4, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[6:7], 0 -; GFX940-NEXT: s_movk_i32 s9, 0x7fff -; GFX940-NEXT: s_mov_b32 s10, 0x7060302 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX940-NEXT: s_movk_i32 s8, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX940-NEXT: s_mov_b32 s9, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, s4 ; GFX940-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v0 -; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX940-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX940-NEXT: v_add_f32_e32 v1, v4, v1 -; GFX940-NEXT: v_add_f32_e32 v0, v6, v0 -; GFX940-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX940-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX940-NEXT: v_add3_u32 v4, v4, v1, s9 -; GFX940-NEXT: v_add3_u32 v7, v7, v0, s9 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, s8 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v0, v4, v6, s[4:5] -; GFX940-NEXT: v_perm_b32 v4, v1, v0, s10 -; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5] +; GFX940-NEXT: v_mov_b32_e32 v7, v0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 +; GFX940-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX940-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX940-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v1, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX940-NEXT: v_add3_u32 v5, v5, v0, s8 +; GFX940-NEXT: v_add3_u32 v8, v8, v1, s8 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 +; GFX940-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5] +; GFX940-NEXT: v_perm_b32 v6, v1, v0, s9 +; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[6:7] +; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 ; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX940-NEXT: s_cbranch_execnz .LBB31_1 @@ -10544,45 +10532,47 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, s16 -; GFX11-NEXT: s_add_i32 s6, s16, 0x400 +; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 +; GFX11-NEXT: s_add_i32 s4, s16, 0x400 ; GFX11-NEXT: s_mov_b32 s5, 0 +; GFX11-NEXT: v_mov_b32_e32 v4, s4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_and_b32 v1, 0xffff0000, v2 +; GFX11-NEXT: v_mov_b32_e32 v6, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX11-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX11-NEXT: v_add_f32_e32 v1, v5, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v5, v1, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX11-NEXT: v_bfe_u32 v7, v1, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: v_add3_u32 v5, v5, v1, 0x7fff +; GFX11-NEXT: v_add3_u32 v7, v7, v1, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_add_f32 v0, v3, v0 :: v_dual_cndmask_b32 v1, v5, v7 -; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; GFX11-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6 +; GFX11-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX11-NEXT: v_mov_b32_e32 v5, s6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff -; GFX11-NEXT: v_cndmask_b32_e64 v0, v3, v6, s4 +; GFX11-NEXT: v_add3_u32 v5, v5, v0, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v1, v0, 0x7060302 -; GFX11-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], 0 offen glc +; GFX11-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 +; GFX11-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -10595,41 +10585,41 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 -; GFX10-NEXT: s_add_i32 s6, s20, 0x400 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 ; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, s4 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX10-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX10-NEXT: v_mov_b32_e32 v6, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX10-NEXT: v_add_f32_e32 v0, v3, v0 -; GFX10-NEXT: v_add_f32_e32 v1, v5, v1 -; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v5, v1, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX10-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v1, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v1 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v5, v5, v1, 0x7fff +; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v1, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v3, v6, s4 -; GFX10-NEXT: v_mov_b32_e32 v5, s6 -; GFX10-NEXT: v_perm_b32 v3, v1, v0, 0x7060302 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 -; GFX10-NEXT: v_mov_b32_e32 v1, v4 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 +; GFX10-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 +; GFX10-NEXT: v_mov_b32_e32 v0, v5 +; GFX10-NEXT: v_mov_b32_e32 v1, v6 +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB31_1 @@ -10640,40 +10630,40 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s8, s20, 0x400 +; GFX90A-NEXT: s_add_i32 s4, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: s_movk_i32 s9, 0x7fff -; GFX90A-NEXT: s_mov_b32 s10, 0x7060302 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, s4 ; GFX90A-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX90A-NEXT: v_add_f32_e32 v1, v4, v1 -; GFX90A-NEXT: v_add_f32_e32 v0, v6, v0 -; GFX90A-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX90A-NEXT: v_add3_u32 v4, v4, v1, s9 -; GFX90A-NEXT: v_add3_u32 v7, v7, v0, s9 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v4, v6, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc -; GFX90A-NEXT: v_perm_b32 v4, v1, v0, s10 -; GFX90A-NEXT: v_mov_b32_e32 v3, s8 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 +; GFX90A-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX90A-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX90A-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v1, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX90A-NEXT: v_add3_u32 v5, v5, v0, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v1, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v6, v1, v0, s9 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB31_1 @@ -10684,41 +10674,41 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v1, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s8, s20, 0x400 +; GFX908-NEXT: s_add_i32 s4, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: s_movk_i32 s9, 0x7fff -; GFX908-NEXT: s_mov_b32 s10, 0x7060302 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX908-NEXT: s_mov_b32 s9, 0x7060302 +; GFX908-NEXT: v_mov_b32_e32 v4, s4 ; GFX908-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX908-NEXT: v_add_f32_e32 v1, v3, v1 -; GFX908-NEXT: v_add_f32_e32 v0, v6, v0 -; GFX908-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX908-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX908-NEXT: v_add3_u32 v3, v3, v1, s9 -; GFX908-NEXT: v_add3_u32 v7, v7, v0, s9 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v3, v6, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc -; GFX908-NEXT: v_perm_b32 v3, v1, v0, s10 -; GFX908-NEXT: v_mov_b32_e32 v0, v3 -; GFX908-NEXT: v_mov_b32_e32 v5, s8 -; GFX908-NEXT: v_mov_b32_e32 v1, v4 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v6, v0 +; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX908-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX908-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX908-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v1, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX908-NEXT: v_add3_u32 v5, v5, v0, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v1, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX908-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v5, v1, v0, s9 +; GFX908-NEXT: v_mov_b32_e32 v0, v5 +; GFX908-NEXT: v_mov_b32_e32 v1, v6 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB31_1 @@ -10729,42 +10719,42 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s20 ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s8, s20, 0x400 +; GFX8-NEXT: s_add_i32 s4, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX8-NEXT: v_add_f32_e32 v1, v3, v1 -; GFX8-NEXT: v_add_f32_e32 v0, v6, v0 -; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v1 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v6, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX8-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v1, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v0 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_alignbit_b32 v3, v1, v0, 16 -; GFX8-NEXT: v_mov_b32_e32 v0, v3 -; GFX8-NEXT: v_mov_b32_e32 v5, s8 -; GFX8-NEXT: v_mov_b32_e32 v1, v4 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX8-NEXT: v_alignbit_b32 v5, v1, v0, 16 +; GFX8-NEXT: v_mov_b32_e32 v0, v5 +; GFX8-NEXT: v_mov_b32_e32 v1, v6 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB31_1 @@ -10776,38 +10766,38 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX7-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v2 ; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; GFX7-NEXT: v_alignbit_b32 v0, v0, v4, 16 -; GFX7-NEXT: v_mov_b32_e32 v5, v1 -; GFX7-NEXT: v_mov_b32_e32 v6, s6 -; GFX7-NEXT: v_mov_b32_e32 v4, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v6 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v5, 16 +; GFX7-NEXT: v_mov_b32_e32 v6, v1 +; GFX7-NEXT: v_mov_b32_e32 v5, v0 +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB31_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -10818,39 +10808,39 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v1 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, s6 ; GFX6-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX6-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v3 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v2 ; GFX6-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; GFX6-NEXT: v_alignbit_b32 v0, v0, v4, 16 -; GFX6-NEXT: v_mov_b32_e32 v5, v1 -; GFX6-NEXT: v_mov_b32_e32 v6, s6 -; GFX6-NEXT: v_mov_b32_e32 v4, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v6 +; GFX6-NEXT: v_alignbit_b32 v0, v0, v5, 16 +; GFX6-NEXT: v_mov_b32_e32 v6, v1 +; GFX6-NEXT: v_mov_b32_e32 v5, v0 +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB31_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -10881,40 +10871,40 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v1, s16 -; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s8, s16, 0x400 +; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 +; GFX940-NEXT: s_add_i32 s4, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[6:7], 0 -; GFX940-NEXT: s_movk_i32 s9, 0x7fff -; GFX940-NEXT: s_mov_b32 s10, 0x7060302 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX940-NEXT: s_movk_i32 s8, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX940-NEXT: s_mov_b32 s9, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, s4 ; GFX940-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX940-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX940-NEXT: v_add_f32_e32 v1, v2, v1 -; GFX940-NEXT: v_add_f32_e32 v2, v5, v4 -; GFX940-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_add3_u32 v4, v4, v1, s9 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s9 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX940-NEXT: v_mov_b32_e32 v6, s8 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] -; GFX940-NEXT: v_perm_b32 v2, v2, v1, s10 -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX940-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX940-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX940-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v0, s8 +; GFX940-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[0:3], 0 offen sc0 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX940-NEXT: v_perm_b32 v0, v5, v0, s9 +; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[0:1] +; GFX940-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 ; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX940-NEXT: s_cbranch_execnz .LBB32_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -10924,45 +10914,43 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s16 -; GFX11-NEXT: s_add_i32 s6, s16, 0x400 +; GFX11-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0 +; GFX11-NEXT: s_add_i32 s4, s16, 0x400 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 +; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: s_mov_b32 s5, 0 -; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_add_f32 v5, v5, v3 :: v_dual_add_f32 v0, v0, v2 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v1, v3, v1 -; GFX11-NEXT: v_add_f32_e32 v3, v5, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v1 -; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-NEXT: v_add3_u32 v4, v4, v1, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s4, v1, v1 -; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v1, v4, v6, s4 -; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo -; GFX11-NEXT: v_mov_b32_e32 v5, s6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v1, v3, v1, 0x7060302 -; GFX11-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 +; GFX11-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, v5 ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -10976,39 +10964,39 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s20 -; GFX10-NEXT: s_add_i32 s6, s20, 0x400 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX10-NEXT: v_mov_b32_e32 v4, s4 +; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f32_e32 v1, v3, v1 -; GFX10-NEXT: v_add_f32_e32 v3, v5, v4 -; GFX10-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1 -; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX10-NEXT: v_add3_u32 v4, v4, v1, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v1, v1 -; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v1, v4, v6, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v5, s6 -; GFX10-NEXT: v_perm_b32 v1, v3, v1, 0x7060302 -; GFX10-NEXT: v_mov_b32_e32 v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX10-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 +; GFX10-NEXT: v_mov_b32_e32 v6, v1 +; GFX10-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB32_1 @@ -11020,39 +11008,39 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, s20 -; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s8, s20, 0x400 +; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s4, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: s_movk_i32 s9, 0x7fff -; GFX90A-NEXT: s_mov_b32 s10, 0x7060302 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, s4 ; GFX90A-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX90A-NEXT: v_add_f32_e32 v1, v2, v1 -; GFX90A-NEXT: v_add_f32_e32 v2, v5, v4 -; GFX90A-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_add3_u32 v4, v4, v1, s9 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s9 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX90A-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc -; GFX90A-NEXT: v_perm_b32 v2, v2, v1, s10 -; GFX90A-NEXT: v_mov_b32_e32 v6, s8 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX90A-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX90A-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s9 +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB32_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -11063,40 +11051,40 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, s20 -; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s8, s20, 0x400 +; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s4, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: s_movk_i32 s9, 0x7fff -; GFX908-NEXT: s_mov_b32 s10, 0x7060302 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX908-NEXT: s_mov_b32 s9, 0x7060302 +; GFX908-NEXT: v_mov_b32_e32 v4, s4 ; GFX908-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX908-NEXT: v_add_f32_e32 v1, v3, v1 -; GFX908-NEXT: v_add_f32_e32 v3, v5, v4 -; GFX908-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX908-NEXT: v_add3_u32 v4, v4, v1, s9 -; GFX908-NEXT: v_add3_u32 v7, v7, v3, s9 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX908-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; GFX908-NEXT: v_perm_b32 v1, v3, v1, s10 -; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_mov_b32_e32 v6, s8 -; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v6, s[16:19], 0 offen glc +; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX908-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX908-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX908-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v0, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9 +; GFX908-NEXT: v_mov_b32_e32 v6, v1 +; GFX908-NEXT: v_mov_b32_e32 v5, v0 +; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v2, v3 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB32_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -11107,41 +11095,41 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s20 -; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s8, s20, 0x400 +; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s4, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: v_add_f32_e32 v1, v3, v1 -; GFX8-NEXT: v_add_f32_e32 v3, v5, v4 -; GFX8-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_alignbit_b32 v1, v3, v1, 16 -; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_mov_b32_e32 v6, s8 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v6, s[16:19], 0 offen glc +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX8-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX8-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 +; GFX8-NEXT: v_mov_b32_e32 v6, v1 +; GFX8-NEXT: v_mov_b32_e32 v5, v0 +; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v2, v3 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB32_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -11152,38 +11140,38 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX7-NEXT: v_add_f32_e32 v4, v4, v0 -; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v5 -; GFX7-NEXT: v_alignbit_b32 v2, v2, v4, 16 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v1 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v0 +; GFX7-NEXT: v_alignbit_b32 v4, v3, v4, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_alignbit_b32 v3, v3, v5, 16 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v6, s6 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB32_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -11194,39 +11182,39 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 ; GFX6-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX6-NEXT: v_add_f32_e32 v4, v4, v0 -; GFX6-NEXT: v_alignbit_b32 v3, v2, v3, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v5 -; GFX6-NEXT: v_alignbit_b32 v2, v2, v4, 16 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v1 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v0 +; GFX6-NEXT: v_alignbit_b32 v4, v3, v4, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_alignbit_b32 v3, v3, v5, 16 +; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: v_mov_b32_e32 v6, s6 -; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB32_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -11257,40 +11245,40 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v1, s16 -; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s8, s16, 0x400 +; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 +; GFX940-NEXT: s_add_i32 s4, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[6:7], 0 -; GFX940-NEXT: s_movk_i32 s9, 0x7fff -; GFX940-NEXT: s_mov_b32 s10, 0x7060302 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX940-NEXT: s_movk_i32 s8, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX940-NEXT: s_mov_b32 s9, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, s4 ; GFX940-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX940-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX940-NEXT: v_add_f32_e32 v1, v2, v1 -; GFX940-NEXT: v_add_f32_e32 v2, v5, v4 -; GFX940-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_add3_u32 v4, v4, v1, s9 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s9 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX940-NEXT: v_mov_b32_e32 v6, s8 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] -; GFX940-NEXT: v_perm_b32 v2, v2, v1, s10 -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX940-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX940-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX940-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v0, s8 +; GFX940-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[0:3], 0 offen sc0 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX940-NEXT: v_perm_b32 v0, v5, v0, s9 +; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[0:1] +; GFX940-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 ; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX940-NEXT: s_cbranch_execnz .LBB33_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -11300,45 +11288,43 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s16 -; GFX11-NEXT: s_add_i32 s6, s16, 0x400 +; GFX11-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0 +; GFX11-NEXT: s_add_i32 s4, s16, 0x400 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 +; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: s_mov_b32 s5, 0 -; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_add_f32 v5, v5, v3 :: v_dual_add_f32 v0, v0, v2 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v1, v3, v1 -; GFX11-NEXT: v_add_f32_e32 v3, v5, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v1 -; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-NEXT: v_add3_u32 v4, v4, v1, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s4, v1, v1 -; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v1, v4, v6, s4 -; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo -; GFX11-NEXT: v_mov_b32_e32 v5, s6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v1, v3, v1, 0x7060302 -; GFX11-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 +; GFX11-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, v5 ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -11352,39 +11338,39 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s20 -; GFX10-NEXT: s_add_i32 s6, s20, 0x400 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX10-NEXT: v_mov_b32_e32 v4, s4 +; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f32_e32 v1, v3, v1 -; GFX10-NEXT: v_add_f32_e32 v3, v5, v4 -; GFX10-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1 -; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX10-NEXT: v_add3_u32 v4, v4, v1, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v1, v1 -; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v1, v4, v6, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v5, s6 -; GFX10-NEXT: v_perm_b32 v1, v3, v1, 0x7060302 -; GFX10-NEXT: v_mov_b32_e32 v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX10-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 +; GFX10-NEXT: v_mov_b32_e32 v6, v1 +; GFX10-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB33_1 @@ -11396,39 +11382,39 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, s20 -; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s8, s20, 0x400 +; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s4, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: s_movk_i32 s9, 0x7fff -; GFX90A-NEXT: s_mov_b32 s10, 0x7060302 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, s4 ; GFX90A-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX90A-NEXT: v_add_f32_e32 v1, v2, v1 -; GFX90A-NEXT: v_add_f32_e32 v2, v5, v4 -; GFX90A-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_add3_u32 v4, v4, v1, s9 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s9 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX90A-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc -; GFX90A-NEXT: v_perm_b32 v2, v2, v1, s10 -; GFX90A-NEXT: v_mov_b32_e32 v6, s8 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX90A-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX90A-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s9 +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB33_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -11439,40 +11425,40 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, s20 -; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s8, s20, 0x400 +; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s4, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: s_movk_i32 s9, 0x7fff -; GFX908-NEXT: s_mov_b32 s10, 0x7060302 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX908-NEXT: s_mov_b32 s9, 0x7060302 +; GFX908-NEXT: v_mov_b32_e32 v4, s4 ; GFX908-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX908-NEXT: v_add_f32_e32 v1, v3, v1 -; GFX908-NEXT: v_add_f32_e32 v3, v5, v4 -; GFX908-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX908-NEXT: v_add3_u32 v4, v4, v1, s9 -; GFX908-NEXT: v_add3_u32 v7, v7, v3, s9 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX908-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; GFX908-NEXT: v_perm_b32 v1, v3, v1, s10 -; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_mov_b32_e32 v6, s8 -; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v6, s[16:19], 0 offen glc +; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX908-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX908-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX908-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v0, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9 +; GFX908-NEXT: v_mov_b32_e32 v6, v1 +; GFX908-NEXT: v_mov_b32_e32 v5, v0 +; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v2, v3 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB33_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -11483,41 +11469,41 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s20 -; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s8, s20, 0x400 +; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s4, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: v_add_f32_e32 v1, v3, v1 -; GFX8-NEXT: v_add_f32_e32 v3, v5, v4 -; GFX8-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_alignbit_b32 v1, v3, v1, 16 -; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_mov_b32_e32 v6, s8 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v6, s[16:19], 0 offen glc +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX8-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX8-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 +; GFX8-NEXT: v_mov_b32_e32 v6, v1 +; GFX8-NEXT: v_mov_b32_e32 v5, v0 +; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v2, v3 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB33_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -11528,38 +11514,38 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX7-NEXT: v_add_f32_e32 v4, v4, v0 -; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v5 -; GFX7-NEXT: v_alignbit_b32 v2, v2, v4, 16 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v1 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v0 +; GFX7-NEXT: v_alignbit_b32 v4, v3, v4, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_alignbit_b32 v3, v3, v5, 16 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v6, s6 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB33_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -11570,39 +11556,39 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 ; GFX6-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX6-NEXT: v_add_f32_e32 v4, v4, v0 -; GFX6-NEXT: v_alignbit_b32 v3, v2, v3, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v5 -; GFX6-NEXT: v_alignbit_b32 v2, v2, v4, 16 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v1 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v0 +; GFX6-NEXT: v_alignbit_b32 v4, v3, v4, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_alignbit_b32 v3, v3, v5, 16 +; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: v_mov_b32_e32 v6, s6 -; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB33_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -11660,23 +11646,23 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 -; GFX10-NEXT: s_add_i32 s5, s20, 0x400 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 +; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: v_mov_b32_e32 v5, s5 +; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 -; GFX10-NEXT: v_mov_b32_e32 v1, v4 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX10-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB34_1 @@ -11692,12 +11678,12 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_ ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc @@ -11720,18 +11706,18 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_ ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX908-NEXT: v_mov_b32_e32 v0, v3 -; GFX908-NEXT: v_mov_b32_e32 v5, s6 -; GFX908-NEXT: v_mov_b32_e32 v1, v4 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v5, v0 +; GFX908-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB34_1 @@ -11747,18 +11733,18 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_ ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX8-NEXT: v_mov_b32_e32 v0, v3 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, v4 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v5, v0 +; GFX8-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB34_1 @@ -11774,18 +11760,18 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_ ; GFX7-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s6 ; GFX7-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v4, v0 -; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v0, v3 -; GFX7-NEXT: v_mov_b32_e32 v5, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v5, v0 +; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX7-NEXT: v_mov_b32_e32 v0, v4 +; GFX7-NEXT: v_mov_b32_e32 v1, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB34_1 @@ -11801,19 +11787,19 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_ ; GFX6-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: v_mov_b32_e32 v3, s6 ; GFX6-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v4, v0 -; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX6-NEXT: v_mov_b32_e32 v5, v0 +; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, v3 -; GFX6-NEXT: v_mov_b32_e32 v5, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v0, v4 +; GFX6-NEXT: v_mov_b32_e32 v1, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB34_1 diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll index b00f8aecbd2f5..c7511a2df9fe1 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll @@ -31,19 +31,19 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 ; GFX940-NEXT: v_mov_b32_e32 v0, s16 ; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 ; GFX940-NEXT: s_add_i32 s6, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, s6 ; GFX940-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v0 -; GFX940-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, s6 +; GFX940-NEXT: v_max_f32_e32 v4, v0, v2 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5] ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 @@ -82,19 +82,19 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 +; GFX90A-NEXT: v_max_f32_e32 v4, v0, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -110,25 +110,25 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v1, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX908-NEXT: v_max_f32_e32 v0, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v0, v1 -; GFX908-NEXT: v_mov_b32_e32 v0, v3 -; GFX908-NEXT: v_mov_b32_e32 v5, s6 -; GFX908-NEXT: v_mov_b32_e32 v1, v4 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v5, v0 +; GFX908-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX908-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB0_1 @@ -145,19 +145,19 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 +; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v4 -; GFX8-NEXT: v_max_f32_e32 v3, v0, v2 -; GFX8-NEXT: v_mov_b32_e32 v0, v3 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, v4 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v5, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; GFX8-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB0_1 @@ -207,24 +207,24 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_ ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v1, s16 -; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:1024 +; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 ; GFX940-NEXT: s_add_i32 s6, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v0, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, s6 ; GFX940-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_max_f32_e32 v1, v0, v0 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v1 -; GFX940-NEXT: v_mov_b32_e32 v6, s6 -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX940-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX940-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[0:1] ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[0:3], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_cbranch_execnz .LBB1_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -257,23 +257,23 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_ ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, s20 -; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024 +; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v0, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_max_f32_e32 v1, v0, v0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v6, s6 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc +; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX90A-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB1_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -284,24 +284,24 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_ ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, s20 -; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v0, v0 +; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f32_e32 v1, v0, v0 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 -; GFX908-NEXT: v_max_f32_e32 v1, v3, v1 -; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_mov_b32_e32 v5, s6 -; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX908-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX908-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX908-NEXT: v_mov_b32_e32 v5, v1 +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v2, v3 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB1_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -316,20 +316,20 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_ ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v0 +; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v1 ; GFX8-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX8-NEXT: v_mov_b32_e32 v4, v1 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 -; GFX8-NEXT: v_mov_b32_e32 v3, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v5, v1 +; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v3 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB1_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -400,7 +400,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_u32_e32 v10, 0x400, v4 +; GFX940-NEXT: v_add_u32_e32 v8, 0x400, v4 ; GFX940-NEXT: s_mov_b64 s[2:3], exec ; GFX940-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 @@ -412,22 +412,22 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp ; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] ; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: buffer_load_dword v9, v4, s[4:7], 0 offen offset:1024 +; GFX940-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024 ; GFX940-NEXT: ; implicit-def: $vgpr4 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB2_1 ; GFX940-NEXT: ; %bb.2: ; GFX940-NEXT: s_mov_b64 exec, s[2:3] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_max_f32_e32 v9, v5, v5 ; GFX940-NEXT: .LBB2_3: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Loop Header: Depth=1 ; GFX940-NEXT: ; Child Loop BB2_4 Depth 2 -; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v6, v9, v9 -; GFX940-NEXT: v_max_f32_e32 v8, v6, v4 +; GFX940-NEXT: v_max_f32_e32 v4, v7, v7 +; GFX940-NEXT: v_max_f32_e32 v6, v4, v9 ; GFX940-NEXT: s_mov_b64 s[8:9], exec -; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[8:9] +; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7] ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1 ; GFX940-NEXT: ; => This Inner Loop Header: Depth=2 @@ -441,21 +441,21 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp ; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_atomic_cmpswap v[6:7], v10, s[4:7], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB2_4 ; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB2_3 Depth=1 ; GFX940-NEXT: s_mov_b64 exec, s[8:9] ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v9, v6 +; GFX940-NEXT: v_mov_b32_e32 v7, v4 ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB2_3 ; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -520,7 +520,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_u32_e32 v10, 0x400, v4 +; GFX90A-NEXT: v_add_u32_e32 v8, 0x400, v4 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec ; GFX90A-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -532,22 +532,22 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_load_dword v9, v4, s[8:11], 0 offen offset:1024 +; GFX90A-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 ; GFX90A-NEXT: ; implicit-def: $vgpr4 ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB2_1 ; GFX90A-NEXT: ; %bb.2: ; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_max_f32_e32 v9, v5, v5 ; GFX90A-NEXT: .LBB2_3: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 ; GFX90A-NEXT: ; Child Loop BB2_4 Depth 2 -; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v6, v9, v9 -; GFX90A-NEXT: v_max_f32_e32 v8, v6, v4 +; GFX90A-NEXT: v_max_f32_e32 v4, v7, v7 +; GFX90A-NEXT: v_max_f32_e32 v6, v4, v9 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[8:9], v[8:9] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -559,27 +559,27 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v10, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB2_4 ; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB2_3 Depth=1 ; GFX90A-NEXT: s_mov_b64 exec, s[12:13] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v9, v6 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB2_3 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_u32_e32 v9, 0x400, v4 +; GFX908-NEXT: v_add_u32_e32 v7, 0x400, v4 ; GFX908-NEXT: s_mov_b64 s[6:7], exec ; GFX908-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 @@ -591,23 +591,23 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 +; GFX908-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 ; GFX908-NEXT: ; implicit-def: $vgpr4 ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB2_1 ; GFX908-NEXT: ; %bb.2: ; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_max_f32_e32 v8, v5, v5 ; GFX908-NEXT: .LBB2_3: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Loop Header: Depth=1 ; GFX908-NEXT: ; Child Loop BB2_4 Depth 2 -; GFX908-NEXT: v_max_f32_e32 v4, v5, v5 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v6, v8, v8 -; GFX908-NEXT: v_max_f32_e32 v7, v6, v4 -; GFX908-NEXT: v_mov_b32_e32 v6, v7 +; GFX908-NEXT: v_max_f32_e32 v4, v6, v6 +; GFX908-NEXT: v_max_f32_e32 v5, v4, v8 +; GFX908-NEXT: v_mov_b32_e32 v4, v5 ; GFX908-NEXT: s_mov_b64 s[12:13], exec -; GFX908-NEXT: v_mov_b32_e32 v7, v8 +; GFX908-NEXT: v_mov_b32_e32 v5, v6 ; GFX908-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 @@ -619,21 +619,21 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB2_4 ; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB2_3 Depth=1 ; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v8, v6 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB2_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v0, v6 +; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -771,19 +771,19 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 ; GFX940-NEXT: v_mov_b32_e32 v0, s16 ; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 ; GFX940-NEXT: s_add_i32 s6, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, s6 ; GFX940-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v0 -; GFX940-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, s6 +; GFX940-NEXT: v_max_f32_e32 v4, v0, v2 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5] ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 @@ -800,28 +800,27 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, s16 -; GFX11-NEXT: s_add_i32 s5, s16, 0x400 -; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 +; GFX11-NEXT: s_add_i32 s4, s16, 0x400 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_max_f32 v2, v1, v1 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 +; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v0 -; GFX11-NEXT: v_max_f32_e32 v0, v2, v2 +; GFX11-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f32_e32 v1, v4, v4 -; GFX11-NEXT: v_max_f32_e32 v3, v1, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX11-NEXT: v_max_f32_e32 v4, v0, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v5, s5 :: v_dual_mov_b32 v0, v3 -; GFX11-NEXT: v_mov_b32_e32 v1, v4 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], 0 offen glc +; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -833,27 +832,27 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 -; GFX10-NEXT: s_add_i32 s5, s20, 0x400 -; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 +; GFX10-NEXT: v_mov_b32_e32 v3, s4 +; GFX10-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: v_max_f32_e32 v0, v2, v2 -; GFX10-NEXT: v_mov_b32_e32 v5, s5 +; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_f32_e32 v1, v4, v4 -; GFX10-NEXT: v_max_f32_e32 v3, v1, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 -; GFX10-NEXT: v_mov_b32_e32 v1, v4 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX10-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX10-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB3_1 @@ -864,19 +863,19 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 +; GFX90A-NEXT: v_max_f32_e32 v4, v0, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -892,25 +891,25 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v1, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX908-NEXT: v_max_f32_e32 v0, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v0, v1 -; GFX908-NEXT: v_mov_b32_e32 v0, v3 -; GFX908-NEXT: v_mov_b32_e32 v5, s6 -; GFX908-NEXT: v_mov_b32_e32 v1, v4 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v5, v0 +; GFX908-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX908-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB3_1 @@ -927,19 +926,19 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 +; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v4 -; GFX8-NEXT: v_max_f32_e32 v3, v0, v2 -; GFX8-NEXT: v_mov_b32_e32 v0, v3 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, v4 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v5, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; GFX8-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB3_1 @@ -956,19 +955,19 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote ; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v1 +; GFX7-NEXT: v_mov_b32_e32 v3, s6 ; GFX7-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v4, v0 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v4 -; GFX7-NEXT: v_max_f32_e32 v3, v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v0, v3 -; GFX7-NEXT: v_mov_b32_e32 v5, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v5, v0 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; GFX7-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v0, v4 +; GFX7-NEXT: v_mov_b32_e32 v1, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB3_1 @@ -985,20 +984,20 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v1 +; GFX6-NEXT: v_mov_b32_e32 v3, s6 ; GFX6-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v4, v0 +; GFX6-NEXT: v_mov_b32_e32 v5, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v4 -; GFX6-NEXT: v_max_f32_e32 v3, v0, v2 -; GFX6-NEXT: v_mov_b32_e32 v0, v3 -; GFX6-NEXT: v_mov_b32_e32 v5, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; GFX6-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX6-NEXT: v_mov_b32_e32 v0, v4 +; GFX6-NEXT: v_mov_b32_e32 v1, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB3_1 @@ -1029,19 +1028,19 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 ; GFX940-NEXT: v_mov_b32_e32 v0, s16 ; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 ; GFX940-NEXT: s_add_i32 s6, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, s6 ; GFX940-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v0 -; GFX940-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, s6 +; GFX940-NEXT: v_max_f32_e32 v4, v0, v2 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5] ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 @@ -1080,19 +1079,19 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 +; GFX90A-NEXT: v_max_f32_e32 v4, v0, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -1108,25 +1107,25 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v1, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX908-NEXT: v_max_f32_e32 v0, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v0, v1 -; GFX908-NEXT: v_mov_b32_e32 v0, v3 -; GFX908-NEXT: v_mov_b32_e32 v5, s6 -; GFX908-NEXT: v_mov_b32_e32 v1, v4 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v5, v0 +; GFX908-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX908-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB4_1 @@ -1143,19 +1142,19 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 +; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v4 -; GFX8-NEXT: v_max_f32_e32 v3, v0, v2 -; GFX8-NEXT: v_mov_b32_e32 v0, v3 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, v4 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v5, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; GFX8-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB4_1 @@ -1198,29 +1197,30 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX12-NEXT: v_mov_b32_e32 v0, s16 -; GFX12-NEXT: s_add_co_i32 s5, s16, 0x800 -; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v6, s4 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 +; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9] -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[0:1] -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v10, s5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 -; GFX12-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] +; GFX12-NEXT: v_max_num_f64_e32 v[7:8], v[0:1], v[4:5] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 +; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -1243,30 +1243,30 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, s16 -; GFX11-NEXT: s_add_i32 s5, s16, 0x800 -; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_add_i32 s4, s16, 0x800 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_mov_b32_e32 v6, s4 +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 -; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 -; GFX11-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[0:1] -; GFX11-NEXT: v_mov_b32_e32 v10, s5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 -; GFX11-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], 0 offen glc +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX11-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 +; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -1298,29 +1298,29 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_mov_b32_e32 v2, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 -; GFX908-NEXT: v_mov_b32_e32 v5, v1 +; GFX908-NEXT: v_mov_b32_e32 v3, v1 ; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_add_i32 s6, s20, 0x800 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_mov_b32_e32 v6, s6 ; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v9, v1 -; GFX908-NEXT: v_mov_b32_e32 v8, v0 -; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX908-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX908-NEXT: v_mov_b32_e32 v10, s6 -; GFX908-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] -; GFX908-NEXT: v_mov_b32_e32 v0, v6 -; GFX908-NEXT: v_mov_b32_e32 v1, v7 -; GFX908-NEXT: v_mov_b32_e32 v2, v8 -; GFX908-NEXT: v_mov_b32_e32 v3, v9 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v10, v1 +; GFX908-NEXT: v_mov_b32_e32 v9, v0 +; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX908-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v7 +; GFX908-NEXT: v_mov_b32_e32 v1, v8 +; GFX908-NEXT: v_mov_b32_e32 v2, v9 +; GFX908-NEXT: v_mov_b32_e32 v3, v10 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB5_1 @@ -1331,29 +1331,29 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s20 -; GFX8-NEXT: v_mov_b32_e32 v5, v1 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_add_i32 s6, s20, 0x800 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 ; GFX8-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v1 -; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX8-NEXT: v_mov_b32_e32 v10, s6 -; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v0, v6 -; GFX8-NEXT: v_mov_b32_e32 v1, v7 -; GFX8-NEXT: v_mov_b32_e32 v2, v8 -; GFX8-NEXT: v_mov_b32_e32 v3, v9 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v10, v1 +; GFX8-NEXT: v_mov_b32_e32 v9, v0 +; GFX8-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX8-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v7 +; GFX8-NEXT: v_mov_b32_e32 v1, v8 +; GFX8-NEXT: v_mov_b32_e32 v2, v9 +; GFX8-NEXT: v_mov_b32_e32 v3, v10 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB5_1 @@ -1393,27 +1393,27 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, s16 -; GFX12-NEXT: s_add_co_i32 s5, s16, 0x800 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] +; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v6, s4 +; GFX12-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], null offen offset:2048 ; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], null offen offset:2048 ; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[0:1], v[0:1] ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[4:5], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[6:7], v[2:3] -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_dual_mov_b32 v10, s5 :: v_dual_mov_b32 v9, v5 -; GFX12-NEXT: v_dual_mov_b32 v8, v4 :: v_dual_mov_b32 v7, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX12-NEXT: v_mov_b32_e32 v6, v2 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[6:9], v10, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 +; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v4, v6 :: v_dual_mov_b32 v5, v7 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] +; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -1437,28 +1437,27 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v2, s16 -; GFX11-NEXT: s_add_i32 s5, s16, 0x800 +; GFX11-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX11-NEXT: s_add_i32 s4, s16, 0x800 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v6, s4 +; GFX11-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], 0 offen offset:2048 ; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], 0 offen offset:2048 -; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[6:7], v[4:5], v[4:5] +; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_max_f64 v[2:3], v[6:7], v[2:3] -; GFX11-NEXT: v_dual_mov_b32 v10, s5 :: v_dual_mov_b32 v9, v5 -; GFX11-NEXT: v_dual_mov_b32 v8, v4 :: v_dual_mov_b32 v7, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_mov_b32_e32 v6, v2 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[6:9], v10, s[0:3], 0 offen glc +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 +; GFX11-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v4, v6 :: v_dual_mov_b32 v5, v7 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] +; GFX11-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -1491,27 +1490,27 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_ ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v2, s20 -; GFX908-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 +; GFX908-NEXT: buffer_load_dwordx2 v[2:3], v2, s[16:19], 0 offen offset:2048 +; GFX908-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] ; GFX908-NEXT: s_add_i32 s6, s20, 0x800 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_mov_b32_e32 v6, s6 ; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[6:7], v[4:5], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v10, s6 -; GFX908-NEXT: v_max_f64 v[2:3], v[6:7], v[2:3] -; GFX908-NEXT: v_mov_b32_e32 v9, v5 -; GFX908-NEXT: v_mov_b32_e32 v8, v4 -; GFX908-NEXT: v_mov_b32_e32 v7, v3 -; GFX908-NEXT: v_mov_b32_e32 v6, v2 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[6:9], v10, s[16:19], 0 offen glc +; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v10, v3 +; GFX908-NEXT: v_mov_b32_e32 v9, v2 +; GFX908-NEXT: v_mov_b32_e32 v8, v1 +; GFX908-NEXT: v_mov_b32_e32 v7, v0 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v6 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] +; GFX908-NEXT: v_mov_b32_e32 v2, v7 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v5, v7 +; GFX908-NEXT: v_mov_b32_e32 v3, v8 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB6_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1522,27 +1521,27 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, s20 -; GFX8-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 +; GFX8-NEXT: buffer_load_dwordx2 v[2:3], v2, s[16:19], 0 offen offset:2048 +; GFX8-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] ; GFX8-NEXT: s_add_i32 s6, s20, 0x800 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 ; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[6:7], v[4:5], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v10, s6 -; GFX8-NEXT: v_max_f64 v[2:3], v[6:7], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v9, v5 -; GFX8-NEXT: v_mov_b32_e32 v8, v4 -; GFX8-NEXT: v_mov_b32_e32 v7, v3 -; GFX8-NEXT: v_mov_b32_e32 v6, v2 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[6:9], v10, s[16:19], 0 offen glc +; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v10, v3 +; GFX8-NEXT: v_mov_b32_e32 v9, v2 +; GFX8-NEXT: v_mov_b32_e32 v8, v1 +; GFX8-NEXT: v_mov_b32_e32 v7, v0 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v6 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v2, v7 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v5, v7 +; GFX8-NEXT: v_mov_b32_e32 v3, v8 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB6_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1606,17 +1605,17 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: s_cbranch_execnz .LBB7_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s1 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[5:6], v[5:6] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB7_3: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Loop Header: Depth=1 ; GFX12-NEXT: ; Child Loop BB7_4 Depth 2 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[5:6], v[5:6] ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[13:14], v[13:14] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[13:14], v[13:14] ; GFX12-NEXT: s_mov_b32 s2, exec_lo ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[11:12], v[2:3], v[0:1] +; GFX12-NEXT: v_max_num_f64_e32 v[11:12], v[0:1], v[4:5] ; GFX12-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12 ; GFX12-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 ; GFX12-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1 @@ -1710,17 +1709,17 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX11-NEXT: s_cbranch_execnz .LBB7_1 ; GFX11-NEXT: ; %bb.2: ; GFX11-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6] ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB7_3: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-NEXT: ; Child Loop BB7_4 Depth 2 -; GFX11-NEXT: v_max_f64 v[0:1], v[5:6], v[5:6] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[2:3], v[13:14], v[13:14] +; GFX11-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14] ; GFX11-NEXT: s_mov_b32 s2, exec_lo ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[11:12], v[2:3], v[0:1] +; GFX11-NEXT: v_max_f64 v[11:12], v[0:1], v[4:5] ; GFX11-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12 ; GFX11-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 ; GFX11-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1 @@ -1838,15 +1837,15 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX908-NEXT: s_cbranch_execnz .LBB7_1 ; GFX908-NEXT: ; %bb.2: ; GFX908-NEXT: s_mov_b64 exec, s[6:7] +; GFX908-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: .LBB7_3: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Loop Header: Depth=1 ; GFX908-NEXT: ; Child Loop BB7_4 Depth 2 -; GFX908-NEXT: v_max_f64 v[0:1], v[5:6], v[5:6] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[2:3], v[13:14], v[13:14] +; GFX908-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14] ; GFX908-NEXT: s_mov_b64 s[12:13], exec -; GFX908-NEXT: v_max_f64 v[11:12], v[2:3], v[0:1] +; GFX908-NEXT: v_max_f64 v[11:12], v[0:1], v[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v11 ; GFX908-NEXT: v_mov_b32_e32 v1, v12 ; GFX908-NEXT: v_mov_b32_e32 v2, v13 @@ -1904,15 +1903,15 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX8-NEXT: s_cbranch_execnz .LBB7_1 ; GFX8-NEXT: ; %bb.2: ; GFX8-NEXT: s_mov_b64 exec, s[6:7] +; GFX8-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB7_3: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Loop Header: Depth=1 ; GFX8-NEXT: ; Child Loop BB7_4 Depth 2 -; GFX8-NEXT: v_max_f64 v[0:1], v[5:6], v[5:6] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[2:3], v[13:14], v[13:14] +; GFX8-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14] ; GFX8-NEXT: s_mov_b64 s[12:13], exec -; GFX8-NEXT: v_max_f64 v[11:12], v[2:3], v[0:1] +; GFX8-NEXT: v_max_f64 v[11:12], v[0:1], v[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v11 ; GFX8-NEXT: v_mov_b32_e32 v1, v12 ; GFX8-NEXT: v_mov_b32_e32 v2, v13 @@ -2012,29 +2011,30 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX12-NEXT: v_mov_b32_e32 v0, s16 -; GFX12-NEXT: s_add_co_i32 s5, s16, 0x800 -; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v6, s4 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 +; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9] -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[0:1] -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v10, s5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 -; GFX12-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] +; GFX12-NEXT: v_max_num_f64_e32 v[7:8], v[0:1], v[4:5] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 +; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -2057,30 +2057,30 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, s16 -; GFX11-NEXT: s_add_i32 s5, s16, 0x800 -; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_add_i32 s4, s16, 0x800 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_mov_b32_e32 v6, s4 +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 -; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 -; GFX11-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[0:1] -; GFX11-NEXT: v_mov_b32_e32 v10, s5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 -; GFX11-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], 0 offen glc +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX11-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 +; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -2092,31 +2092,31 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 -; GFX10-NEXT: v_mov_b32_e32 v5, v1 -; GFX10-NEXT: s_add_i32 s5, s20, 0x800 -; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: s_add_i32 s4, s20, 0x800 +; GFX10-NEXT: v_mov_b32_e32 v6, s4 ; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 +; GFX10-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v9, v1 -; GFX10-NEXT: v_mov_b32_e32 v8, v0 -; GFX10-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] -; GFX10-NEXT: v_mov_b32_e32 v10, s5 +; GFX10-NEXT: v_mov_b32_e32 v10, v1 +; GFX10-NEXT: v_mov_b32_e32 v9, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX10-NEXT: v_max_f64 v[6:7], v[2:3], v[0:1] -; GFX10-NEXT: v_mov_b32_e32 v0, v6 -; GFX10-NEXT: v_mov_b32_e32 v1, v7 -; GFX10-NEXT: v_mov_b32_e32 v2, v8 -; GFX10-NEXT: v_mov_b32_e32 v3, v9 -; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc +; GFX10-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX10-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] +; GFX10-NEXT: v_mov_b32_e32 v0, v7 +; GFX10-NEXT: v_mov_b32_e32 v1, v8 +; GFX10-NEXT: v_mov_b32_e32 v2, v9 +; GFX10-NEXT: v_mov_b32_e32 v3, v10 +; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB8_1 @@ -2127,26 +2127,26 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x800 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX90A-NEXT: v_mov_b32_e32 v6, s6 ; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX90A-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX90A-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[0,1] -; GFX90A-NEXT: v_mov_b32_e32 v10, s6 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[8:9], v[8:9] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc +; GFX90A-NEXT: v_pk_mov_b32 v[10:11], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_max_f64 v[0:1], v[10:11], v[10:11] +; GFX90A-NEXT: v_max_f64 v[8:9], v[0:1], v[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[8:9] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[10:11], v[10:11] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB8_1 @@ -2157,29 +2157,29 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_mov_b32_e32 v2, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 -; GFX908-NEXT: v_mov_b32_e32 v5, v1 +; GFX908-NEXT: v_mov_b32_e32 v3, v1 ; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_add_i32 s6, s20, 0x800 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_mov_b32_e32 v6, s6 ; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v9, v1 -; GFX908-NEXT: v_mov_b32_e32 v8, v0 -; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX908-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX908-NEXT: v_mov_b32_e32 v10, s6 -; GFX908-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] -; GFX908-NEXT: v_mov_b32_e32 v0, v6 -; GFX908-NEXT: v_mov_b32_e32 v1, v7 -; GFX908-NEXT: v_mov_b32_e32 v2, v8 -; GFX908-NEXT: v_mov_b32_e32 v3, v9 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v10, v1 +; GFX908-NEXT: v_mov_b32_e32 v9, v0 +; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX908-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v7 +; GFX908-NEXT: v_mov_b32_e32 v1, v8 +; GFX908-NEXT: v_mov_b32_e32 v2, v9 +; GFX908-NEXT: v_mov_b32_e32 v3, v10 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB8_1 @@ -2190,29 +2190,29 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s20 -; GFX8-NEXT: v_mov_b32_e32 v5, v1 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_add_i32 s6, s20, 0x800 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 ; GFX8-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v1 -; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX8-NEXT: v_mov_b32_e32 v10, s6 -; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v0, v6 -; GFX8-NEXT: v_mov_b32_e32 v1, v7 -; GFX8-NEXT: v_mov_b32_e32 v2, v8 -; GFX8-NEXT: v_mov_b32_e32 v3, v9 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v10, v1 +; GFX8-NEXT: v_mov_b32_e32 v9, v0 +; GFX8-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX8-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v7 +; GFX8-NEXT: v_mov_b32_e32 v1, v8 +; GFX8-NEXT: v_mov_b32_e32 v2, v9 +; GFX8-NEXT: v_mov_b32_e32 v3, v10 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB8_1 @@ -2223,29 +2223,29 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v4, v0 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s20 -; GFX7-NEXT: v_mov_b32_e32 v5, v1 +; GFX7-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX7-NEXT: s_add_i32 s6, s20, 0x800 +; GFX7-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s6 ; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v9, v1 -; GFX7-NEXT: v_mov_b32_e32 v8, v0 -; GFX7-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX7-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX7-NEXT: v_mov_b32_e32 v10, s6 -; GFX7-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v0, v6 -; GFX7-NEXT: v_mov_b32_e32 v1, v7 -; GFX7-NEXT: v_mov_b32_e32 v2, v8 -; GFX7-NEXT: v_mov_b32_e32 v3, v9 -; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v10, v1 +; GFX7-NEXT: v_mov_b32_e32 v9, v0 +; GFX7-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX7-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] +; GFX7-NEXT: v_mov_b32_e32 v0, v7 +; GFX7-NEXT: v_mov_b32_e32 v1, v8 +; GFX7-NEXT: v_mov_b32_e32 v2, v9 +; GFX7-NEXT: v_mov_b32_e32 v3, v10 +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB8_1 @@ -2256,30 +2256,30 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v4, v0 +; GFX6-NEXT: v_mov_b32_e32 v2, v0 ; GFX6-NEXT: v_mov_b32_e32 v0, s20 -; GFX6-NEXT: v_mov_b32_e32 v5, v1 +; GFX6-NEXT: v_mov_b32_e32 v3, v1 ; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX6-NEXT: s_add_i32 s6, s20, 0x800 +; GFX6-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: v_mov_b32_e32 v6, s6 ; GFX6-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v9, v1 -; GFX6-NEXT: v_mov_b32_e32 v8, v0 +; GFX6-NEXT: v_mov_b32_e32 v10, v1 +; GFX6-NEXT: v_mov_b32_e32 v9, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX6-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX6-NEXT: v_mov_b32_e32 v10, s6 -; GFX6-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] -; GFX6-NEXT: v_mov_b32_e32 v0, v6 -; GFX6-NEXT: v_mov_b32_e32 v1, v7 -; GFX6-NEXT: v_mov_b32_e32 v2, v8 -; GFX6-NEXT: v_mov_b32_e32 v3, v9 -; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc +; GFX6-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX6-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] +; GFX6-NEXT: v_mov_b32_e32 v0, v7 +; GFX6-NEXT: v_mov_b32_e32 v1, v8 +; GFX6-NEXT: v_mov_b32_e32 v2, v9 +; GFX6-NEXT: v_mov_b32_e32 v3, v10 +; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB8_1 @@ -2300,29 +2300,30 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX12-NEXT: v_mov_b32_e32 v0, s16 -; GFX12-NEXT: s_add_co_i32 s5, s16, 0x800 -; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v6, s4 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 +; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9] -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[0:1] -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v10, s5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 -; GFX12-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] +; GFX12-NEXT: v_max_num_f64_e32 v[7:8], v[0:1], v[4:5] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 +; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -2345,30 +2346,30 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, s16 -; GFX11-NEXT: s_add_i32 s5, s16, 0x800 -; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_add_i32 s4, s16, 0x800 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_mov_b32_e32 v6, s4 +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 -; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 -; GFX11-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[0:1] -; GFX11-NEXT: v_mov_b32_e32 v10, s5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 -; GFX11-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], 0 offen glc +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX11-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 +; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -2400,29 +2401,29 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_mov_b32_e32 v2, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 -; GFX908-NEXT: v_mov_b32_e32 v5, v1 +; GFX908-NEXT: v_mov_b32_e32 v3, v1 ; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_add_i32 s6, s20, 0x800 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_mov_b32_e32 v6, s6 ; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v9, v1 -; GFX908-NEXT: v_mov_b32_e32 v8, v0 -; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX908-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX908-NEXT: v_mov_b32_e32 v10, s6 -; GFX908-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] -; GFX908-NEXT: v_mov_b32_e32 v0, v6 -; GFX908-NEXT: v_mov_b32_e32 v1, v7 -; GFX908-NEXT: v_mov_b32_e32 v2, v8 -; GFX908-NEXT: v_mov_b32_e32 v3, v9 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v10, v1 +; GFX908-NEXT: v_mov_b32_e32 v9, v0 +; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX908-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v7 +; GFX908-NEXT: v_mov_b32_e32 v1, v8 +; GFX908-NEXT: v_mov_b32_e32 v2, v9 +; GFX908-NEXT: v_mov_b32_e32 v3, v10 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB9_1 @@ -2433,29 +2434,29 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s20 -; GFX8-NEXT: v_mov_b32_e32 v5, v1 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_add_i32 s6, s20, 0x800 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 ; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v1 -; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX8-NEXT: v_mov_b32_e32 v10, s6 -; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v0, v6 -; GFX8-NEXT: v_mov_b32_e32 v1, v7 -; GFX8-NEXT: v_mov_b32_e32 v2, v8 -; GFX8-NEXT: v_mov_b32_e32 v3, v9 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v10, v1 +; GFX8-NEXT: v_mov_b32_e32 v9, v0 +; GFX8-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX8-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v7 +; GFX8-NEXT: v_mov_b32_e32 v1, v8 +; GFX8-NEXT: v_mov_b32_e32 v2, v9 +; GFX8-NEXT: v_mov_b32_e32 v3, v10 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB9_1 @@ -2499,47 +2500,47 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_addk_co_i32 s16, 0x200 +; GFX12-NEXT: v_max_num_f16_e32 v5, v0, v0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_b32 s4, s16, -4 -; GFX12-NEXT: s_and_b32 s5, s16, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v1, s4 -; GFX12-NEXT: s_lshl_b32 s5, s5, 3 +; GFX12-NEXT: v_mov_b32_e32 v4, s4 +; GFX12-NEXT: s_and_b32 s4, s16, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_lshl_b32 s6, 0xffff, s5 +; GFX12-NEXT: s_lshl_b32 s4, s4, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_not_b32 s7, s6 -; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], null offen -; GFX12-NEXT: s_mov_b32 s6, 0 +; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_not_b32 s6, s5 +; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v1, s5, v2 -; GFX12-NEXT: v_max_num_f16_e32 v3, v0, v0 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v1, v1, v1 -; GFX12-NEXT: v_max_num_f16_e32 v1, v1, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX12-NEXT: v_lshlrev_b32_e32 v1, s5, v1 +; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0 +; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_and_or_b32 v1, v2, s7, v1 -; GFX12-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_mov_b32 v4, v2 -; GFX12-NEXT: v_mov_b32_e32 v3, v1 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX12-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX12-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_cbranch_execnz .LBB10_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s6 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, s5, v3 +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -2547,263 +2548,256 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_addk_i32 s16, 0x200 -; GFX940-NEXT: s_and_b32 s6, s16, -4 -; GFX940-NEXT: v_mov_b32_e32 v1, s6 -; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen +; GFX940-NEXT: s_and_b32 s4, s16, -4 +; GFX940-NEXT: v_mov_b32_e32 v4, s4 +; GFX940-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen ; GFX940-NEXT: s_and_b32 s4, s16, 3 -; GFX940-NEXT: s_lshl_b32 s7, s4, 3 -; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX940-NEXT: s_not_b32 s8, s4 +; GFX940-NEXT: s_lshl_b32 s6, s4, 3 +; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX940-NEXT: s_not_b32 s7, s4 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 +; GFX940-NEXT: v_max_f16_e32 v5, v0, v0 ; GFX940-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v1, s7, v3 -; GFX940-NEXT: v_max_f16_e32 v2, v0, v0 -; GFX940-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX940-NEXT: v_max_f16_e32 v1, v1, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v1, s7, v1 -; GFX940-NEXT: v_and_or_b32 v2, v3, s8, v1 -; GFX940-NEXT: v_mov_b32_e32 v6, s6 -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX940-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX940-NEXT: v_max_f16_e32 v0, v0, v5 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX940-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX940-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[0:3], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_cbranch_execnz .LBB10_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, s7, v4 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_addk_i32 s16, 0x200 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_max_f16_e32 v5, v0, v0 ; GFX11-NEXT: s_and_b32 s4, s16, -4 -; GFX11-NEXT: s_and_b32 s5, s16, 3 -; GFX11-NEXT: v_mov_b32_e32 v1, s4 -; GFX11-NEXT: s_lshl_b32 s5, s5, 3 -; GFX11-NEXT: s_lshl_b32 s6, 0xffff, s5 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v4, s4 +; GFX11-NEXT: s_and_b32 s4, s16, 3 +; GFX11-NEXT: s_lshl_b32 s4, s4, 3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_not_b32 s7, s6 -; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen -; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX11-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen +; GFX11-NEXT: s_not_b32 s6, s5 +; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, s5, v2 -; GFX11-NEXT: v_max_f16_e32 v3, v0, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX11-NEXT: v_max_f16_e32 v1, v1, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, s5, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_or_b32 v1, v2, s7, v1 -; GFX11-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_mov_b32 v4, v2 -; GFX11-NEXT: v_mov_b32_e32 v3, v1 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc +; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX11-NEXT: v_max_f16_e32 v0, v0, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, v3 -; GFX11-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, v2 +; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_cbranch_execnz .LBB10_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s6 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, s5, v3 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_addk_i32 s20, 0x200 +; GFX10-NEXT: v_max_f16_e32 v5, v0, v0 ; GFX10-NEXT: s_and_b32 s4, s20, -4 -; GFX10-NEXT: s_and_b32 s5, s20, 3 -; GFX10-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-NEXT: s_lshl_b32 s5, s5, 3 -; GFX10-NEXT: s_lshl_b32 s6, 0xffff, s5 -; GFX10-NEXT: s_not_b32 s7, s6 -; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen -; GFX10-NEXT: s_mov_b32 s6, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, s4 +; GFX10-NEXT: s_and_b32 s4, s20, 3 +; GFX10-NEXT: s_lshl_b32 s4, s4, 3 +; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX10-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX10-NEXT: s_not_b32 s6, s5 +; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v1, s5, v2 -; GFX10-NEXT: v_max_f16_e32 v3, v0, v0 -; GFX10-NEXT: v_mov_b32_e32 v5, s4 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX10-NEXT: v_max_f16_e32 v1, v1, v3 -; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v1, v2, s7, v1 -; GFX10-NEXT: v_mov_b32_e32 v4, v2 +; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX10-NEXT: v_max_f16_e32 v0, v0, v5 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v3 -; GFX10-NEXT: s_or_b32 s6, vcc_lo, s6 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v2 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB10_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s6 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, s5, v3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_addk_i32 s20, 0x200 -; GFX90A-NEXT: s_and_b32 s6, s20, -4 -; GFX90A-NEXT: v_mov_b32_e32 v1, s6 -; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen +; GFX90A-NEXT: s_and_b32 s4, s20, -4 +; GFX90A-NEXT: v_mov_b32_e32 v4, s4 +; GFX90A-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen ; GFX90A-NEXT: s_and_b32 s4, s20, 3 -; GFX90A-NEXT: s_lshl_b32 s7, s4, 3 -; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX90A-NEXT: s_not_b32 s8, s4 +; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 +; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX90A-NEXT: s_not_b32 s7, s4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f16_e32 v5, v0, v0 ; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v1, s7, v3 -; GFX90A-NEXT: v_max_f16_e32 v2, v0, v0 -; GFX90A-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX90A-NEXT: v_max_f16_e32 v1, v1, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v1, s7, v1 -; GFX90A-NEXT: v_and_or_b32 v2, v3, s8, v1 -; GFX90A-NEXT: v_mov_b32_e32 v6, s6 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX90A-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX90A-NEXT: v_max_f16_e32 v0, v0, v5 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB10_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s7, v4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_addk_i32 s20, 0x200 -; GFX908-NEXT: s_and_b32 s6, s20, -4 -; GFX908-NEXT: v_mov_b32_e32 v1, s6 -; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen +; GFX908-NEXT: s_and_b32 s4, s20, -4 +; GFX908-NEXT: v_mov_b32_e32 v4, s4 +; GFX908-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen ; GFX908-NEXT: s_and_b32 s4, s20, 3 -; GFX908-NEXT: s_lshl_b32 s7, s4, 3 -; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX908-NEXT: s_not_b32 s8, s4 +; GFX908-NEXT: s_lshl_b32 s6, s4, 3 +; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX908-NEXT: s_not_b32 s7, s4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f16_e32 v5, v0, v0 ; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v1, s7, v2 -; GFX908-NEXT: v_max_f16_e32 v3, v0, v0 -; GFX908-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX908-NEXT: v_max_f16_e32 v1, v1, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v1, s7, v1 -; GFX908-NEXT: v_and_or_b32 v1, v2, s8, v1 -; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_mov_b32_e32 v5, s6 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX908-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX908-NEXT: v_max_f16_e32 v0, v0, v5 +; GFX908-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v2, v3 +; GFX908-NEXT: v_mov_b32_e32 v1, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB10_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, s7, v3 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_addk_i32 s20, 0x200 -; GFX8-NEXT: s_and_b32 s6, s20, -4 -; GFX8-NEXT: v_mov_b32_e32 v1, s6 -; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen +; GFX8-NEXT: s_and_b32 s4, s20, -4 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen ; GFX8-NEXT: s_and_b32 s4, s20, 3 -; GFX8-NEXT: s_lshl_b32 s7, s4, 3 -; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX8-NEXT: s_not_b32 s8, s4 +; GFX8-NEXT: s_lshl_b32 s6, s4, 3 +; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX8-NEXT: s_not_b32 s7, s4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_e32 v5, v0, v0 ; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v1, s7, v2 -; GFX8-NEXT: v_max_f16_e32 v3, v0, v0 -; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX8-NEXT: v_max_f16_e32 v1, v1, v3 -; GFX8-NEXT: v_and_b32_e32 v4, s8, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, s7, v1 -; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX8-NEXT: v_max_f16_e32 v0, v0, v5 +; GFX8-NEXT: v_and_b32_e32 v2, s7, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v2, v3 +; GFX8-NEXT: v_mov_b32_e32 v1, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB10_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s7, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_addk_i32 s20, 0x200 -; GFX7-NEXT: s_and_b32 s6, s20, -4 -; GFX7-NEXT: v_mov_b32_e32 v1, s6 -; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX7-NEXT: s_and_b32 s4, s20, -4 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_and_b32 s4, s20, 3 -; GFX7-NEXT: s_lshl_b32 s7, s4, 3 -; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v0 -; GFX7-NEXT: s_not_b32 s8, s4 +; GFX7-NEXT: s_lshl_b32 s6, s4, 3 +; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0 +; GFX7-NEXT: s_not_b32 s7, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s7, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_and_b32_e32 v2, s8, v1 -; GFX7-NEXT: v_mov_b32_e32 v5, s6 -; GFX7-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX7-NEXT: v_and_b32_e32 v2, s7, v1 +; GFX7-NEXT: v_max_f32_e32 v0, v0, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s7, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX7-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -2813,7 +2807,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX7-NEXT: s_cbranch_execnz .LBB10_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s7, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -2821,31 +2815,30 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_addk_i32 s20, 0x200 -; GFX6-NEXT: s_and_b32 s6, s20, -4 -; GFX6-NEXT: v_mov_b32_e32 v1, s6 -; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX6-NEXT: s_and_b32 s4, s20, -4 +; GFX6-NEXT: v_mov_b32_e32 v4, s4 +; GFX6-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: s_and_b32 s4, s20, 3 -; GFX6-NEXT: s_lshl_b32 s7, s4, 3 -; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v0 -; GFX6-NEXT: s_not_b32 s8, s4 +; GFX6-NEXT: s_lshl_b32 s6, s4, 3 +; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0 +; GFX6-NEXT: s_not_b32 s7, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s7, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v2, s8, v1 -; GFX6-NEXT: v_mov_b32_e32 v5, s6 -; GFX6-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX6-NEXT: v_and_b32_e32 v2, s7, v1 +; GFX6-NEXT: v_max_f32_e32 v0, v0, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s7, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT: v_mov_b32_e32 v3, v1 ; GFX6-NEXT: v_mov_b32_e32 v2, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -2855,7 +2848,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX6-NEXT: s_cbranch_execnz .LBB10_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s7, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -2873,46 +2866,46 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_addk_co_i32 s16, 0x200 +; GFX12-NEXT: v_max_num_f16_e32 v3, v0, v0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_b32 s4, s16, -4 -; GFX12-NEXT: s_and_b32 s5, s16, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v1, s4 -; GFX12-NEXT: s_lshl_b32 s5, s5, 3 +; GFX12-NEXT: v_mov_b32_e32 v2, s4 +; GFX12-NEXT: s_and_b32 s4, s16, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_lshl_b32 s6, 0xffff, s5 +; GFX12-NEXT: s_lshl_b32 s4, s4, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_not_b32 s7, s6 -; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], null offen -; GFX12-NEXT: s_mov_b32 s6, 0 +; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_not_b32 s6, s5 +; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v1, s5, v2 -; GFX12-NEXT: v_max_num_f16_e32 v3, v0, v0 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v1, v1, v1 -; GFX12-NEXT: v_max_num_f16_e32 v1, v1, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX12-NEXT: v_lshlrev_b32_e32 v1, s5, v1 +; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0 +; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_and_or_b32 v1, v2, s7, v1 -; GFX12-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_mov_b32 v4, v2 -; GFX12-NEXT: v_mov_b32_e32 v3, v1 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX12-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 +; GFX12-NEXT: v_mov_b32_e32 v1, v4 +; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_cbranch_execnz .LBB11_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -2920,32 +2913,31 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_addk_i32 s16, 0x200 -; GFX940-NEXT: s_and_b32 s6, s16, -4 -; GFX940-NEXT: v_mov_b32_e32 v1, s6 -; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen +; GFX940-NEXT: s_and_b32 s4, s16, -4 +; GFX940-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen ; GFX940-NEXT: s_and_b32 s4, s16, 3 -; GFX940-NEXT: s_lshl_b32 s7, s4, 3 -; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX940-NEXT: s_not_b32 s8, s4 +; GFX940-NEXT: s_lshl_b32 s6, s4, 3 +; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX940-NEXT: s_not_b32 s7, s4 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 +; GFX940-NEXT: v_max_f16_e32 v3, v0, v0 ; GFX940-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v1, s7, v3 -; GFX940-NEXT: v_max_f16_e32 v2, v0, v0 -; GFX940-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX940-NEXT: v_max_f16_e32 v1, v1, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v1, s7, v1 -; GFX940-NEXT: v_and_or_b32 v2, v3, s8, v1 -; GFX940-NEXT: v_mov_b32_e32 v6, s6 -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX940-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX940-NEXT: v_max_f16_e32 v0, v0, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX940-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[0:1] ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[0:3], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_cbranch_execnz .LBB11_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2956,114 +2948,111 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_addk_i32 s16, 0x200 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_max_f16_e32 v3, v0, v0 ; GFX11-NEXT: s_and_b32 s4, s16, -4 -; GFX11-NEXT: s_and_b32 s5, s16, 3 -; GFX11-NEXT: v_mov_b32_e32 v1, s4 -; GFX11-NEXT: s_lshl_b32 s5, s5, 3 -; GFX11-NEXT: s_lshl_b32 s6, 0xffff, s5 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-NEXT: s_and_b32 s4, s16, 3 +; GFX11-NEXT: s_lshl_b32 s4, s4, 3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_not_b32 s7, s6 -; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen -; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX11-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen +; GFX11-NEXT: s_not_b32 s6, s5 +; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, s5, v2 -; GFX11-NEXT: v_max_f16_e32 v3, v0, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX11-NEXT: v_max_f16_e32 v1, v1, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, s5, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_or_b32 v1, v2, s7, v1 -; GFX11-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_mov_b32 v4, v2 -; GFX11-NEXT: v_mov_b32_e32 v3, v1 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc +; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX11-NEXT: v_max_f16_e32 v0, v0, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, v3 -; GFX11-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, v4 +; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_cbranch_execnz .LBB11_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_addk_i32 s20, 0x200 +; GFX10-NEXT: v_max_f16_e32 v3, v0, v0 ; GFX10-NEXT: s_and_b32 s4, s20, -4 -; GFX10-NEXT: s_and_b32 s5, s20, 3 -; GFX10-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-NEXT: s_lshl_b32 s5, s5, 3 -; GFX10-NEXT: s_lshl_b32 s6, 0xffff, s5 -; GFX10-NEXT: s_not_b32 s7, s6 -; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen -; GFX10-NEXT: s_mov_b32 s6, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-NEXT: s_and_b32 s4, s20, 3 +; GFX10-NEXT: s_lshl_b32 s4, s4, 3 +; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX10-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX10-NEXT: s_not_b32 s6, s5 +; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v1, s5, v2 -; GFX10-NEXT: v_max_f16_e32 v3, v0, v0 -; GFX10-NEXT: v_mov_b32_e32 v5, s4 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX10-NEXT: v_max_f16_e32 v1, v1, v3 -; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v1, v2, s7, v1 -; GFX10-NEXT: v_mov_b32_e32 v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX10-NEXT: v_max_f16_e32 v0, v0, v3 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX10-NEXT: v_mov_b32_e32 v5, v1 +; GFX10-NEXT: v_mov_b32_e32 v4, v0 +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v3 -; GFX10-NEXT: s_or_b32 s6, vcc_lo, s6 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB11_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_addk_i32 s20, 0x200 -; GFX90A-NEXT: s_and_b32 s6, s20, -4 -; GFX90A-NEXT: v_mov_b32_e32 v1, s6 -; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen +; GFX90A-NEXT: s_and_b32 s4, s20, -4 +; GFX90A-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX90A-NEXT: s_and_b32 s4, s20, 3 -; GFX90A-NEXT: s_lshl_b32 s7, s4, 3 -; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX90A-NEXT: s_not_b32 s8, s4 +; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 +; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX90A-NEXT: s_not_b32 s7, s4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f16_e32 v3, v0, v0 ; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v1, s7, v3 -; GFX90A-NEXT: v_max_f16_e32 v2, v0, v0 -; GFX90A-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX90A-NEXT: v_max_f16_e32 v1, v1, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v1, s7, v1 -; GFX90A-NEXT: v_and_or_b32 v2, v3, s8, v1 -; GFX90A-NEXT: v_mov_b32_e32 v6, s6 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX90A-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX90A-NEXT: v_max_f16_e32 v0, v0, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3074,32 +3063,31 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_addk_i32 s20, 0x200 -; GFX908-NEXT: s_and_b32 s6, s20, -4 -; GFX908-NEXT: v_mov_b32_e32 v1, s6 -; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen +; GFX908-NEXT: s_and_b32 s4, s20, -4 +; GFX908-NEXT: v_mov_b32_e32 v2, s4 +; GFX908-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX908-NEXT: s_and_b32 s4, s20, 3 -; GFX908-NEXT: s_lshl_b32 s7, s4, 3 -; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX908-NEXT: s_not_b32 s8, s4 +; GFX908-NEXT: s_lshl_b32 s6, s4, 3 +; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX908-NEXT: s_not_b32 s7, s4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f16_e32 v3, v0, v0 ; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v1, s7, v2 -; GFX908-NEXT: v_max_f16_e32 v3, v0, v0 -; GFX908-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX908-NEXT: v_max_f16_e32 v1, v1, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v1, s7, v1 -; GFX908-NEXT: v_and_or_b32 v1, v2, s8, v1 -; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_mov_b32_e32 v5, s6 -; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX908-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX908-NEXT: v_max_f16_e32 v0, v0, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX908-NEXT: v_mov_b32_e32 v5, v1 +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v2, v3 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB11_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3110,33 +3098,32 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_addk_i32 s20, 0x200 -; GFX8-NEXT: s_and_b32 s6, s20, -4 -; GFX8-NEXT: v_mov_b32_e32 v1, s6 -; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen +; GFX8-NEXT: s_and_b32 s4, s20, -4 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX8-NEXT: s_and_b32 s4, s20, 3 -; GFX8-NEXT: s_lshl_b32 s7, s4, 3 -; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX8-NEXT: s_not_b32 s8, s4 +; GFX8-NEXT: s_lshl_b32 s6, s4, 3 +; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX8-NEXT: s_not_b32 s7, s4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_e32 v3, v0, v0 ; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v1, s7, v2 -; GFX8-NEXT: v_max_f16_e32 v3, v0, v0 -; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX8-NEXT: v_max_f16_e32 v1, v1, v3 -; GFX8-NEXT: v_and_b32_e32 v4, s8, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, s7, v1 -; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX8-NEXT: v_max_f16_e32 v0, v0, v3 +; GFX8-NEXT: v_and_b32_e32 v4, s7, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX8-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX8-NEXT: v_mov_b32_e32 v5, v1 +; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v2, v3 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB11_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3147,35 +3134,34 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_addk_i32 s20, 0x200 -; GFX7-NEXT: s_and_b32 s6, s20, -4 -; GFX7-NEXT: v_mov_b32_e32 v1, s6 -; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX7-NEXT: s_and_b32 s4, s20, -4 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_and_b32 s4, s20, 3 -; GFX7-NEXT: s_lshl_b32 s7, s4, 3 -; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX7-NEXT: s_not_b32 s8, s4 +; GFX7-NEXT: s_lshl_b32 s6, s4, 3 +; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX7-NEXT: s_not_b32 s7, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s7, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_and_b32_e32 v3, s8, v1 -; GFX7-NEXT: v_mov_b32_e32 v5, s6 -; GFX7-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX7-NEXT: v_and_b32_e32 v4, s7, v1 +; GFX7-NEXT: v_max_f32_e32 v0, v0, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s7, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX7-NEXT: v_mov_b32_e32 v4, v1 -; GFX7-NEXT: v_mov_b32_e32 v3, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX7-NEXT: v_mov_b32_e32 v5, v1 +; GFX7-NEXT: v_mov_b32_e32 v4, v0 +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v1, v3 +; GFX7-NEXT: v_mov_b32_e32 v1, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB11_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3186,36 +3172,35 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_addk_i32 s20, 0x200 -; GFX6-NEXT: s_and_b32 s6, s20, -4 -; GFX6-NEXT: v_mov_b32_e32 v1, s6 -; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX6-NEXT: s_and_b32 s4, s20, -4 +; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: s_and_b32 s4, s20, 3 -; GFX6-NEXT: s_lshl_b32 s7, s4, 3 -; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX6-NEXT: s_not_b32 s8, s4 +; GFX6-NEXT: s_lshl_b32 s6, s4, 3 +; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX6-NEXT: s_not_b32 s7, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s7, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v3, s8, v1 -; GFX6-NEXT: v_mov_b32_e32 v5, s6 -; GFX6-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX6-NEXT: v_and_b32_e32 v4, s7, v1 +; GFX6-NEXT: v_max_f32_e32 v0, v0, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s7, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX6-NEXT: v_mov_b32_e32 v4, v1 -; GFX6-NEXT: v_mov_b32_e32 v3, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX6-NEXT: v_mov_b32_e32 v5, v1 +; GFX6-NEXT: v_mov_b32_e32 v4, v0 +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v1, v3 +; GFX6-NEXT: v_mov_b32_e32 v1, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB11_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3235,15 +3220,15 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_nc_u32_e32 v6, 0x200, v4 +; GFX12-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 ; GFX12-NEXT: s_mov_b32 s1, exec_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_and_b32_e32 v4, 3, v6 -; GFX12-NEXT: v_and_b32_e32 v10, -4, v6 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-NEXT: v_and_b32_e32 v6, 3, v4 +; GFX12-NEXT: v_and_b32_e32 v8, -4, v4 +; GFX12-NEXT: v_lshlrev_b32_e32 v7, 3, v6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff -; GFX12-NEXT: v_not_b32_e32 v11, v7 +; GFX12-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff +; GFX12-NEXT: v_not_b32_e32 v9, v6 ; GFX12-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 @@ -3258,30 +3243,31 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_load_b32 v7, v10, s[4:7], null offen +; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB12_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s1 +; GFX12-NEXT: v_max_num_f16_e32 v10, v5, v5 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Loop Header: Depth=1 ; GFX12-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v6, v4, v7 -; GFX12-NEXT: v_max_num_f16_e32 v8, v5, v5 +; GFX12-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX12-NEXT: s_mov_b32 s2, exec_lo ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v6, v6, v6 -; GFX12-NEXT: v_max_num_f16_e32 v6, v6, v8 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX12-NEXT: v_lshlrev_b32_e32 v6, v4, v6 +; GFX12-NEXT: v_max_num_f16_e32 v4, v4, v4 +; GFX12-NEXT: v_max_num_f16_e32 v4, v4, v10 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v6, v7, v11, v6 -; GFX12-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 +; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, v7, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_or_b32 v5, v6, v9, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v5 +; GFX12-NEXT: v_mov_b32_e32 v5, v6 ; GFX12-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 @@ -3297,15 +3283,15 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB12_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7 -; GFX12-NEXT: v_mov_b32_e32 v7, v8 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 +; GFX12-NEXT: v_mov_b32_e32 v6, v4 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -3313,7 +3299,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: s_cbranch_execnz .LBB12_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v4, v8 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -3321,12 +3307,12 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_add_u32_e32 v4, 0x200, v4 -; GFX940-NEXT: v_and_b32_e32 v10, -4, v4 +; GFX940-NEXT: v_and_b32_e32 v9, -4, v4 ; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX940-NEXT: v_lshlrev_b32_e32 v8, 3, v4 ; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v6, v4, s0 -; GFX940-NEXT: v_not_b32_e32 v11, v6 +; GFX940-NEXT: v_lshlrev_b32_e64 v4, v8, s0 +; GFX940-NEXT: v_not_b32_e32 v10, v4 ; GFX940-NEXT: s_mov_b64 s[2:3], exec ; GFX940-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 @@ -3338,24 +3324,24 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] ; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: buffer_load_dword v7, v10, s[4:7], 0 offen +; GFX940-NEXT: buffer_load_dword v7, v9, s[4:7], 0 offen ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB12_1 ; GFX940-NEXT: ; %bb.2: ; GFX940-NEXT: s_mov_b64 exec, s[2:3] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_max_f16_e32 v11, v5, v5 ; GFX940-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Loop Header: Depth=1 ; GFX940-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v6, v4, v7 -; GFX940-NEXT: v_max_f16_e32 v6, v6, v6 -; GFX940-NEXT: v_max_f16_e32 v8, v5, v5 -; GFX940-NEXT: v_max_f16_e32 v6, v6, v8 -; GFX940-NEXT: v_lshlrev_b32_e32 v6, v4, v6 -; GFX940-NEXT: v_and_or_b32 v6, v7, v11, v6 +; GFX940-NEXT: v_lshrrev_b32_e32 v4, v8, v7 +; GFX940-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX940-NEXT: v_max_f16_e32 v4, v4, v11 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, v8, v4 +; GFX940-NEXT: v_and_or_b32 v6, v7, v10, v4 ; GFX940-NEXT: s_mov_b64 s[8:9], exec -; GFX940-NEXT: v_mov_b64_e32 v[8:9], v[6:7] +; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7] ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX940-NEXT: ; => This Inner Loop Header: Depth=2 @@ -3369,36 +3355,36 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[4:7], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[4:7], 0 offen sc0 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB12_4 ; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX940-NEXT: s_mov_b64 exec, s[8:9] ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v7, v8 +; GFX940-NEXT: v_mov_b32_e32 v7, v4 ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB12_3 ; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v4, v8 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, v8, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x200, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_mov_b32 s2, exec_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_b32_e32 v4, 3, v6 -; GFX11-NEXT: v_and_b32_e32 v10, -4, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-NEXT: v_and_b32_e32 v6, 3, v4 +; GFX11-NEXT: v_and_b32_e32 v8, -4, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 3, v6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff -; GFX11-NEXT: v_not_b32_e32 v11, v7 +; GFX11-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff +; GFX11-NEXT: v_not_b32_e32 v9, v6 ; GFX11-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-NEXT: v_readfirstlane_b32 s5, v1 @@ -3410,29 +3396,30 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-NEXT: buffer_load_b32 v7, v10, s[4:7], 0 offen +; GFX11-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB12_1 ; GFX11-NEXT: ; %bb.2: ; GFX11-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-NEXT: v_max_f16_e32 v10, v5, v5 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v6, v4, v7 -; GFX11-NEXT: v_max_f16_e32 v8, v5, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX11-NEXT: s_mov_b32 s2, exec_lo ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v6, v6, v6 -; GFX11-NEXT: v_max_f16_e32 v6, v6, v8 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v6, v4, v6 +; GFX11-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX11-NEXT: v_max_f16_e32 v4, v4, v10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, v7, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v6, v7, v11, v6 -; GFX11-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 +; GFX11-NEXT: v_and_or_b32 v5, v6, v9, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v5 +; GFX11-NEXT: v_mov_b32_e32 v5, v6 ; GFX11-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 @@ -3446,14 +3433,14 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], 0 offen glc +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB12_4 ; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7 -; GFX11-NEXT: v_mov_b32_e32 v7, v8 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 +; GFX11-NEXT: v_mov_b32_e32 v6, v4 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 @@ -3462,20 +3449,20 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX11-NEXT: s_cbranch_execnz .LBB12_3 ; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v4, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_nc_u32_e32 v6, 0x200, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: s_mov_b32 s6, exec_lo -; GFX10-NEXT: v_and_b32_e32 v4, 3, v6 -; GFX10-NEXT: v_and_b32_e32 v10, -4, v6 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX10-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff -; GFX10-NEXT: v_not_b32_e32 v11, v7 +; GFX10-NEXT: v_and_b32_e32 v6, 3, v4 +; GFX10-NEXT: v_and_b32_e32 v8, -4, v4 +; GFX10-NEXT: v_lshlrev_b32_e32 v7, 3, v6 +; GFX10-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff +; GFX10-NEXT: v_not_b32_e32 v9, v6 ; GFX10-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 ; GFX10-NEXT: v_readfirstlane_b32 s9, v1 @@ -3485,26 +3472,26 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 -; GFX10-NEXT: buffer_load_dword v7, v10, s[8:11], 0 offen +; GFX10-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB12_1 ; GFX10-NEXT: ; %bb.2: ; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: v_max_f16_e32 v10, v5, v5 ; GFX10-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Loop Header: Depth=1 ; GFX10-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v6, v4, v7 -; GFX10-NEXT: v_max_f16_e32 v8, v5, v5 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_f16_e32 v6, v6, v6 -; GFX10-NEXT: v_max_f16_e32 v6, v6, v8 -; GFX10-NEXT: v_lshlrev_b32_sdwa v6, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v6, v7, v11, v6 -; GFX10-NEXT: v_mov_b32_e32 v9, v7 -; GFX10-NEXT: v_mov_b32_e32 v8, v6 +; GFX10-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX10-NEXT: v_max_f16_e32 v4, v4, v10 +; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v5, v6, v9, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v5 +; GFX10-NEXT: v_mov_b32_e32 v5, v6 ; GFX10-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 @@ -3516,15 +3503,15 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB12_4 ; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7 -; GFX10-NEXT: v_mov_b32_e32 v7, v8 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 +; GFX10-NEXT: v_mov_b32_e32 v6, v4 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 @@ -3533,19 +3520,19 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX10-NEXT: s_cbranch_execnz .LBB12_3 ; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, v4, v8 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_u32_e32 v4, 0x200, v4 -; GFX90A-NEXT: v_and_b32_e32 v10, -4, v4 +; GFX90A-NEXT: v_and_b32_e32 v9, -4, v4 ; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v8, 3, v4 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v6, v4, s4 -; GFX90A-NEXT: v_not_b32_e32 v11, v6 +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v8, s4 +; GFX90A-NEXT: v_not_b32_e32 v10, v4 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec ; GFX90A-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -3557,24 +3544,24 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_load_dword v7, v10, s[8:11], 0 offen +; GFX90A-NEXT: buffer_load_dword v7, v9, s[8:11], 0 offen ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB12_1 ; GFX90A-NEXT: ; %bb.2: ; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_max_f16_e32 v11, v5, v5 ; GFX90A-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 ; GFX90A-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v4, v7 -; GFX90A-NEXT: v_max_f16_e32 v6, v6, v6 -; GFX90A-NEXT: v_max_f16_e32 v8, v5, v5 -; GFX90A-NEXT: v_max_f16_e32 v6, v6, v8 -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, v4, v6 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v11, v6 +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v8, v7 +; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX90A-NEXT: v_max_f16_e32 v4, v4, v11 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v8, v4 +; GFX90A-NEXT: v_and_or_b32 v6, v7, v10, v4 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec -; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -3586,33 +3573,33 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB12_4 ; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX90A-NEXT: s_mov_b64 exec, s[12:13] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v7, v8 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB12_3 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v8 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v8, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_u32_e32 v4, 0x200, v4 -; GFX908-NEXT: v_and_b32_e32 v10, -4, v4 +; GFX908-NEXT: v_and_b32_e32 v8, -4, v4 ; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX908-NEXT: v_lshlrev_b32_e32 v7, 3, v4 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v6, v4, s4 -; GFX908-NEXT: v_not_b32_e32 v11, v6 +; GFX908-NEXT: v_lshlrev_b32_e64 v4, v7, s4 +; GFX908-NEXT: v_not_b32_e32 v9, v4 ; GFX908-NEXT: s_mov_b64 s[6:7], exec ; GFX908-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 @@ -3624,25 +3611,25 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: buffer_load_dword v7, v10, s[8:11], 0 offen +; GFX908-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB12_1 ; GFX908-NEXT: ; %bb.2: ; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_max_f16_e32 v10, v5, v5 ; GFX908-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Loop Header: Depth=1 ; GFX908-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v6, v4, v7 -; GFX908-NEXT: v_max_f16_e32 v6, v6, v6 -; GFX908-NEXT: v_max_f16_e32 v8, v5, v5 -; GFX908-NEXT: v_max_f16_e32 v6, v6, v8 -; GFX908-NEXT: v_lshlrev_b32_e32 v6, v4, v6 -; GFX908-NEXT: v_and_or_b32 v6, v7, v11, v6 -; GFX908-NEXT: v_mov_b32_e32 v9, v7 +; GFX908-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX908-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX908-NEXT: v_max_f16_e32 v4, v4, v10 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, v7, v4 +; GFX908-NEXT: v_and_or_b32 v5, v6, v9, v4 +; GFX908-NEXT: v_mov_b32_e32 v4, v5 ; GFX908-NEXT: s_mov_b64 s[12:13], exec -; GFX908-NEXT: v_mov_b32_e32 v8, v6 +; GFX908-NEXT: v_mov_b32_e32 v5, v6 ; GFX908-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 @@ -3654,33 +3641,33 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB12_4 ; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v7, v8 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB12_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, v4, v8 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x200, v4 -; GFX8-NEXT: v_and_b32_e32 v10, -4, v4 +; GFX8-NEXT: v_and_b32_e32 v8, -4, v4 ; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v7, 3, v4 ; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v6, v4, s4 -; GFX8-NEXT: v_not_b32_e32 v11, v6 +; GFX8-NEXT: v_lshlrev_b32_e64 v4, v7, s4 +; GFX8-NEXT: v_not_b32_e32 v9, v4 ; GFX8-NEXT: s_mov_b64 s[6:7], exec ; GFX8-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 @@ -3692,26 +3679,26 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: buffer_load_dword v7, v10, s[8:11], 0 offen +; GFX8-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB12_1 ; GFX8-NEXT: ; %bb.2: ; GFX8-NEXT: s_mov_b64 exec, s[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_max_f16_e32 v10, v5, v5 ; GFX8-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Loop Header: Depth=1 ; GFX8-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v6, v4, v7 -; GFX8-NEXT: v_max_f16_e32 v6, v6, v6 -; GFX8-NEXT: v_max_f16_e32 v8, v5, v5 -; GFX8-NEXT: v_max_f16_e32 v6, v6, v8 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, v4, v6 -; GFX8-NEXT: v_and_b32_e32 v8, v7, v11 -; GFX8-NEXT: v_or_b32_e32 v6, v8, v6 -; GFX8-NEXT: v_mov_b32_e32 v9, v7 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX8-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX8-NEXT: v_max_f16_e32 v4, v4, v10 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, v7, v4 +; GFX8-NEXT: v_and_b32_e32 v5, v6, v9 +; GFX8-NEXT: v_or_b32_e32 v5, v5, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, v5 ; GFX8-NEXT: s_mov_b64 s[12:13], exec -; GFX8-NEXT: v_mov_b32_e32 v8, v6 +; GFX8-NEXT: v_mov_b32_e32 v5, v6 ; GFX8-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 @@ -3723,21 +3710,21 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB12_4 ; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v7, v8 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB12_3 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, v4, v8 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -3898,27 +3885,28 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_addk_co_i32 s16, 0x200 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_b32 s4, s16, -4 -; GFX12-NEXT: s_and_b32 s5, s16, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v1, s4 -; GFX12-NEXT: s_lshl_b32 s5, s5, 3 +; GFX12-NEXT: v_mov_b32_e32 v4, s4 +; GFX12-NEXT: s_and_b32 s4, s16, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_lshl_b32 s6, 0xffff, s5 +; GFX12-NEXT: s_lshl_b32 s4, s4, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_not_b32 s7, s6 -; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen -; GFX12-NEXT: s_mov_b32 s6, 0 +; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_not_b32 s6, s5 +; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, s5, v1 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_lshlrev_b32 v0, 16, v0 -; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v4 +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v3, 0x400000, v0 @@ -3928,23 +3916,23 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v0, s5, v0 +; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v5, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, v2 -; GFX12-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_cbranch_execnz .LBB13_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s6 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, s5, v2 +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -3952,33 +3940,32 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_addk_i32 s16, 0x200 -; GFX940-NEXT: s_and_b32 s6, s16, -4 -; GFX940-NEXT: v_mov_b32_e32 v1, s6 -; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen +; GFX940-NEXT: s_and_b32 s4, s16, -4 +; GFX940-NEXT: v_mov_b32_e32 v4, s4 +; GFX940-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen ; GFX940-NEXT: s_and_b32 s4, s16, 3 -; GFX940-NEXT: s_lshl_b32 s7, s4, 3 -; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX940-NEXT: s_not_b32 s8, s4 +; GFX940-NEXT: s_lshl_b32 s6, s4, 3 +; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX940-NEXT: s_not_b32 s7, s4 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v0 -; GFX940-NEXT: s_movk_i32 s9, 0x7fff +; GFX940-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX940-NEXT: s_movk_i32 s8, 0x7fff ; GFX940-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: v_mov_b32_e32 v5, s6 -; GFX940-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: v_max_f32_e32 v0, v0, v5 ; GFX940-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX940-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX940-NEXT: v_add3_u32 v2, v2, v0, s9 +; GFX940-NEXT: v_add3_u32 v2, v2, v0, s8 ; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: s_nop 1 ; GFX940-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v0, v1, s8, v0 +; GFX940-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX940-NEXT: v_mov_b64_e32 v[2:3], v[0:1] -; GFX940-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[0:3], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -3988,32 +3975,33 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX940-NEXT: s_cbranch_execnz .LBB13_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, s7, v2 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_addk_i32 s16, 0x200 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX11-NEXT: s_and_b32 s4, s16, -4 -; GFX11-NEXT: s_and_b32 s5, s16, 3 -; GFX11-NEXT: v_mov_b32_e32 v1, s4 -; GFX11-NEXT: s_lshl_b32 s5, s5, 3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_lshl_b32 s6, 0xffff, s5 -; GFX11-NEXT: s_not_b32 s7, s6 -; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen -; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v4, s4 +; GFX11-NEXT: s_and_b32 s4, s16, 3 +; GFX11-NEXT: s_lshl_b32 s4, s4, 3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX11-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen +; GFX11-NEXT: s_not_b32 s6, s5 +; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, s5, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_lshlrev_b32 v0, 16, v0 -; GFX11-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_max_f32_e32 v0, v0, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v0 @@ -4023,97 +4011,95 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, s5, v0 -; GFX11-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v5, s[0:3], 0 offen glc +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX11-NEXT: v_mov_b32_e32 v1, v2 -; GFX11-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_cbranch_execnz .LBB13_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s6 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, s5, v2 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_addk_i32 s20, 0x200 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX10-NEXT: s_and_b32 s4, s20, -4 -; GFX10-NEXT: s_and_b32 s5, s20, 3 -; GFX10-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-NEXT: s_lshl_b32 s5, s5, 3 -; GFX10-NEXT: s_lshl_b32 s6, 0xffff, s5 -; GFX10-NEXT: s_not_b32 s7, s6 -; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen -; GFX10-NEXT: s_mov_b32 s6, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, s4 +; GFX10-NEXT: s_and_b32 s4, s20, 3 +; GFX10-NEXT: s_lshl_b32 s4, s4, 3 +; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX10-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX10-NEXT: s_not_b32 s6, s5 +; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_mov_b32_e32 v5, s4 +; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v5 ; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v2 -; GFX10-NEXT: s_or_b32 s6, vcc_lo, s6 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s6 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB13_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s6 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, s5, v2 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_addk_i32 s20, 0x200 -; GFX90A-NEXT: s_and_b32 s6, s20, -4 -; GFX90A-NEXT: v_mov_b32_e32 v1, s6 -; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX90A-NEXT: s_and_b32 s4, s20, -4 +; GFX90A-NEXT: v_mov_b32_e32 v4, s4 +; GFX90A-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen ; GFX90A-NEXT: s_and_b32 s4, s20, 3 -; GFX90A-NEXT: s_lshl_b32 s7, s4, 3 -; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX90A-NEXT: s_not_b32 s8, s4 +; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 +; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX90A-NEXT: s_not_b32 s7, s4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v0 -; GFX90A-NEXT: s_movk_i32 s9, 0x7fff +; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_max_f32_e32 v0, v0, v5 ; GFX90A-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX90A-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX90A-NEXT: v_add3_u32 v2, v2, v0, s9 +; GFX90A-NEXT: v_add3_u32 v2, v2, v0, s8 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX90A-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v0, v1, s8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, s6 +; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -4123,39 +4109,38 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s7, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_addk_i32 s20, 0x200 -; GFX908-NEXT: s_and_b32 s6, s20, -4 -; GFX908-NEXT: v_mov_b32_e32 v1, s6 -; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX908-NEXT: s_and_b32 s4, s20, -4 +; GFX908-NEXT: v_mov_b32_e32 v4, s4 +; GFX908-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen ; GFX908-NEXT: s_and_b32 s4, s20, 3 -; GFX908-NEXT: s_lshl_b32 s7, s4, 3 -; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX908-NEXT: s_not_b32 s8, s4 +; GFX908-NEXT: s_lshl_b32 s6, s4, 3 +; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX908-NEXT: s_not_b32 s7, s4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v0 -; GFX908-NEXT: s_movk_i32 s9, 0x7fff +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX908-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_max_f32_e32 v0, v0, v5 ; GFX908-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX908-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX908-NEXT: v_add3_u32 v2, v2, v0, s9 +; GFX908-NEXT: v_add3_u32 v2, v2, v0, s8 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX908-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v0, v1, s8, v0 +; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: v_mov_b32_e32 v5, s6 ; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -4165,41 +4150,40 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX908-NEXT: s_cbranch_execnz .LBB13_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, s7, v2 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_addk_i32 s20, 0x200 -; GFX8-NEXT: s_and_b32 s6, s20, -4 -; GFX8-NEXT: v_mov_b32_e32 v1, s6 -; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX8-NEXT: s_and_b32 s4, s20, -4 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen ; GFX8-NEXT: s_and_b32 s4, s20, 3 -; GFX8-NEXT: s_lshl_b32 s7, s4, 3 -; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX8-NEXT: s_not_b32 s8, s4 +; GFX8-NEXT: s_lshl_b32 s6, s4, 3 +; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX8-NEXT: s_not_b32 s7, s4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_mov_b32_e32 v0, s7 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX8-NEXT: v_max_f32_e32 v3, v3, v5 ; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 ; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc -; GFX8-NEXT: v_and_b32_e32 v2, s8, v1 +; GFX8-NEXT: v_and_b32_e32 v2, s7, v1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -4209,38 +4193,37 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX8-NEXT: s_cbranch_execnz .LBB13_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s7, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_addk_i32 s20, 0x200 -; GFX7-NEXT: s_and_b32 s6, s20, -4 -; GFX7-NEXT: v_mov_b32_e32 v1, s6 -; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX7-NEXT: s_and_b32 s4, s20, -4 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen ; GFX7-NEXT: s_and_b32 s4, s20, 3 -; GFX7-NEXT: s_lshl_b32 s7, s4, 3 -; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX7-NEXT: s_lshl_b32 s6, s4, 3 +; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: s_not_b32 s8, s4 +; GFX7-NEXT: s_not_b32 s7, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 ; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s7, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX7-NEXT: v_max_f32_e32 v0, v0, v5 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_and_b32_e32 v2, s8, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s7, v0 +; GFX7-NEXT: v_and_b32_e32 v2, s7, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: v_mov_b32_e32 v5, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -4250,7 +4233,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX7-NEXT: s_cbranch_execnz .LBB13_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s7, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -4258,32 +4241,31 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_addk_i32 s20, 0x200 -; GFX6-NEXT: s_and_b32 s6, s20, -4 -; GFX6-NEXT: v_mov_b32_e32 v1, s6 -; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX6-NEXT: s_and_b32 s4, s20, -4 +; GFX6-NEXT: v_mov_b32_e32 v4, s4 +; GFX6-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen ; GFX6-NEXT: s_and_b32 s4, s20, 3 -; GFX6-NEXT: s_lshl_b32 s7, s4, 3 -; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX6-NEXT: s_lshl_b32 s6, s4, 3 +; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: s_not_b32 s8, s4 +; GFX6-NEXT: s_not_b32 s7, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 ; GFX6-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s7, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX6-NEXT: v_max_f32_e32 v0, v0, v5 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v2, s8, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s7, v0 +; GFX6-NEXT: v_and_b32_e32 v2, s7, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT: v_mov_b32_e32 v3, v1 -; GFX6-NEXT: v_mov_b32_e32 v5, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -4293,7 +4275,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX6-NEXT: s_cbranch_execnz .LBB13_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s7, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -4311,52 +4293,53 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_addk_co_i32 s16, 0x200 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_b32 s4, s16, -4 -; GFX12-NEXT: s_and_b32 s5, s16, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v1, s4 -; GFX12-NEXT: s_lshl_b32 s5, s5, 3 +; GFX12-NEXT: v_mov_b32_e32 v2, s4 +; GFX12-NEXT: s_and_b32 s4, s16, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_lshl_b32 s6, 0xffff, s5 +; GFX12-NEXT: s_lshl_b32 s4, s4, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_not_b32 s7, s6 -; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen -; GFX12-NEXT: s_mov_b32 s6, 0 +; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_not_b32 s6, s5 +; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, s5, v1 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_lshlrev_b32 v0, 16, v0 -; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX12-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX12-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX12-NEXT: v_add3_u32 v4, v4, v0, 0x7fff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v0, s5, v0 +; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v3, v0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v1 -; GFX12-NEXT: v_mov_b32_e32 v1, v3 -; GFX12-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 +; GFX12-NEXT: v_mov_b32_e32 v1, v4 +; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_cbranch_execnz .LBB14_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -4364,33 +4347,32 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_addk_i32 s16, 0x200 -; GFX940-NEXT: s_and_b32 s6, s16, -4 -; GFX940-NEXT: v_mov_b32_e32 v1, s6 -; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen +; GFX940-NEXT: s_and_b32 s4, s16, -4 +; GFX940-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen ; GFX940-NEXT: s_and_b32 s4, s16, 3 -; GFX940-NEXT: s_lshl_b32 s7, s4, 3 -; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX940-NEXT: s_not_b32 s8, s4 +; GFX940-NEXT: s_lshl_b32 s6, s4, 3 +; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX940-NEXT: s_not_b32 s7, s4 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX940-NEXT: s_movk_i32 s9, 0x7fff +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX940-NEXT: s_movk_i32 s8, 0x7fff ; GFX940-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: v_mov_b32_e32 v3, s6 -; GFX940-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: v_max_f32_e32 v0, v0, v3 ; GFX940-NEXT: v_bfe_u32 v4, v0, 16, 1 ; GFX940-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX940-NEXT: v_add3_u32 v4, v4, v0, s9 +; GFX940-NEXT: v_add3_u32 v4, v4, v0, s8 ; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: s_nop 1 ; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v0, v1, s8, v0 +; GFX940-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[0:1] -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[0:3], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 @@ -4406,123 +4388,122 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_addk_i32 s16, 0x200 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX11-NEXT: s_and_b32 s4, s16, -4 -; GFX11-NEXT: s_and_b32 s5, s16, 3 -; GFX11-NEXT: v_mov_b32_e32 v1, s4 -; GFX11-NEXT: s_lshl_b32 s5, s5, 3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_lshl_b32 s6, 0xffff, s5 -; GFX11-NEXT: s_not_b32 s7, s6 -; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen -; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-NEXT: s_and_b32 s4, s16, 3 +; GFX11-NEXT: s_lshl_b32 s4, s4, 3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX11-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen +; GFX11-NEXT: s_not_b32 s6, s5 +; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, s5, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_lshlrev_b32 v0, 16, v0 -; GFX11-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_max_f32_e32 v0, v0, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX11-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX11-NEXT: v_add3_u32 v4, v4, v0, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, s5, v0 -; GFX11-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v3, v0 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc +; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v1 -; GFX11-NEXT: v_mov_b32_e32 v1, v3 -; GFX11-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, v4 +; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_cbranch_execnz .LBB14_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_addk_i32 s20, 0x200 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX10-NEXT: s_and_b32 s4, s20, -4 -; GFX10-NEXT: s_and_b32 s5, s20, 3 -; GFX10-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-NEXT: s_lshl_b32 s5, s5, 3 -; GFX10-NEXT: s_lshl_b32 s6, 0xffff, s5 -; GFX10-NEXT: s_not_b32 s7, s6 -; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen -; GFX10-NEXT: s_mov_b32 s6, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-NEXT: s_and_b32 s4, s20, 3 +; GFX10-NEXT: s_lshl_b32 s4, s4, 3 +; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX10-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX10-NEXT: s_not_b32 s6, s5 +; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_mov_b32_e32 v5, s4 +; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v3 +; GFX10-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v0, v1, s7, v0 -; GFX10-NEXT: v_mov_b32_e32 v4, v1 -; GFX10-NEXT: v_mov_b32_e32 v3, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX10-NEXT: v_add3_u32 v4, v4, v0, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX10-NEXT: v_mov_b32_e32 v5, v1 +; GFX10-NEXT: v_mov_b32_e32 v4, v0 +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v3 -; GFX10-NEXT: s_or_b32 s6, vcc_lo, s6 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB14_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_addk_i32 s20, 0x200 -; GFX90A-NEXT: s_and_b32 s6, s20, -4 -; GFX90A-NEXT: v_mov_b32_e32 v1, s6 -; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX90A-NEXT: s_and_b32 s4, s20, -4 +; GFX90A-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX90A-NEXT: s_and_b32 s4, s20, 3 -; GFX90A-NEXT: s_lshl_b32 s7, s4, 3 -; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX90A-NEXT: s_not_b32 s8, s4 +; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 +; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX90A-NEXT: s_not_b32 s7, s4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX90A-NEXT: s_movk_i32 s9, 0x7fff +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_max_f32_e32 v0, v0, v3 ; GFX90A-NEXT: v_bfe_u32 v4, v0, 16, 1 ; GFX90A-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX90A-NEXT: v_add3_u32 v4, v4, v0, s9 +; GFX90A-NEXT: v_add3_u32 v4, v4, v0, s8 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX90A-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v0, v1, s8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 +; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 @@ -4538,37 +4519,36 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_addk_i32 s20, 0x200 -; GFX908-NEXT: s_and_b32 s6, s20, -4 -; GFX908-NEXT: v_mov_b32_e32 v1, s6 -; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX908-NEXT: s_and_b32 s4, s20, -4 +; GFX908-NEXT: v_mov_b32_e32 v2, s4 +; GFX908-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX908-NEXT: s_and_b32 s4, s20, 3 -; GFX908-NEXT: s_lshl_b32 s7, s4, 3 -; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX908-NEXT: s_not_b32 s8, s4 +; GFX908-NEXT: s_lshl_b32 s6, s4, 3 +; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX908-NEXT: s_not_b32 s7, s4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX908-NEXT: s_movk_i32 s9, 0x7fff +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX908-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX908-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v0 -; GFX908-NEXT: v_add3_u32 v3, v3, v0, s9 +; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_max_f32_e32 v0, v0, v3 +; GFX908-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX908-NEXT: v_add3_u32 v4, v4, v0, s8 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX908-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v0, v1, s8, v0 -; GFX908-NEXT: v_mov_b32_e32 v4, v1 -; GFX908-NEXT: v_mov_b32_e32 v5, s6 -; GFX908-NEXT: v_mov_b32_e32 v3, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX908-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX908-NEXT: v_mov_b32_e32 v5, v1 +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v1, v3 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB14_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4579,39 +4559,38 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_addk_i32 s20, 0x200 -; GFX8-NEXT: s_and_b32 s6, s20, -4 -; GFX8-NEXT: v_mov_b32_e32 v1, s6 -; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX8-NEXT: s_and_b32 s4, s20, -4 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX8-NEXT: s_and_b32 s4, s20, 3 -; GFX8-NEXT: s_lshl_b32 s7, s4, 3 -; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX8-NEXT: s_not_b32 s8, s4 +; GFX8-NEXT: s_lshl_b32 s6, s4, 3 +; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX8-NEXT: s_not_b32 s7, s4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_mov_b32_e32 v0, s7 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f32_e32 v4, v4, v2 -; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 +; GFX8-NEXT: v_lshrrev_b32_sdwa v5, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f32_e32 v5, v5, v3 +; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc -; GFX8-NEXT: v_and_b32_e32 v3, s8, v1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX8-NEXT: v_mov_b32_e32 v4, v1 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 -; GFX8-NEXT: v_mov_b32_e32 v3, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc +; GFX8-NEXT: v_and_b32_e32 v4, s7, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX8-NEXT: v_mov_b32_e32 v5, v1 +; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v3 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB14_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4622,36 +4601,35 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_addk_i32 s20, 0x200 -; GFX7-NEXT: s_and_b32 s6, s20, -4 -; GFX7-NEXT: v_mov_b32_e32 v1, s6 -; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX7-NEXT: s_and_b32 s4, s20, -4 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX7-NEXT: s_and_b32 s4, s20, 3 -; GFX7-NEXT: s_lshl_b32 s7, s4, 3 -; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX7-NEXT: s_lshl_b32 s6, s4, 3 +; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: s_not_b32 s8, s4 +; GFX7-NEXT: s_not_b32 s7, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX7-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s7, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX7-NEXT: v_max_f32_e32 v0, v0, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_and_b32_e32 v3, s8, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s7, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX7-NEXT: v_mov_b32_e32 v4, v1 -; GFX7-NEXT: v_mov_b32_e32 v5, s6 -; GFX7-NEXT: v_mov_b32_e32 v3, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX7-NEXT: v_and_b32_e32 v4, s7, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX7-NEXT: v_mov_b32_e32 v5, v1 +; GFX7-NEXT: v_mov_b32_e32 v4, v0 +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v1, v3 +; GFX7-NEXT: v_mov_b32_e32 v1, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB14_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4662,37 +4640,36 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_addk_i32 s20, 0x200 -; GFX6-NEXT: s_and_b32 s6, s20, -4 -; GFX6-NEXT: v_mov_b32_e32 v1, s6 -; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX6-NEXT: s_and_b32 s4, s20, -4 +; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX6-NEXT: s_and_b32 s4, s20, 3 -; GFX6-NEXT: s_lshl_b32 s7, s4, 3 -; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX6-NEXT: s_lshl_b32 s6, s4, 3 +; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: s_not_b32 s8, s4 +; GFX6-NEXT: s_not_b32 s7, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX6-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s7, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX6-NEXT: v_max_f32_e32 v0, v0, v3 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v3, s8, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s7, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX6-NEXT: v_mov_b32_e32 v4, v1 -; GFX6-NEXT: v_mov_b32_e32 v5, s6 -; GFX6-NEXT: v_mov_b32_e32 v3, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX6-NEXT: v_and_b32_e32 v4, s7, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX6-NEXT: v_mov_b32_e32 v5, v1 +; GFX6-NEXT: v_mov_b32_e32 v4, v0 +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v1, v3 +; GFX6-NEXT: v_mov_b32_e32 v1, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB14_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5419,27 +5396,29 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v2, v0 -; GFX12-NEXT: v_mov_b32_e32 v0, s16 -; GFX12-NEXT: s_add_co_i32 s5, s16, 0x400 -; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 +; GFX12-NEXT: s_add_co_i32 s4, s16, 0x400 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v3, s4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_pk_max_num_f16 v2, v1, v1 ; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024 +; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, s5 -; GFX12-NEXT: v_pk_max_num_f16 v0, v2, v2 +; GFX12-NEXT: v_mov_b32_e32 v5, v0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v1, v4, v4 -; GFX12-NEXT: v_pk_max_num_f16 v3, v1, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v0, v5, v5 +; GFX12-NEXT: v_pk_max_num_f16 v4, v0, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -5452,20 +5431,21 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 ; GFX940-NEXT: v_mov_b32_e32 v0, s16 ; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 ; GFX940-NEXT: s_add_i32 s6, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 +; GFX940-NEXT: v_pk_max_f16 v2, v1, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, s6 ; GFX940-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v0 -; GFX940-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX940-NEXT: v_pk_max_f16 v0, v5, v5 -; GFX940-NEXT: v_mov_b32_e32 v3, s6 -; GFX940-NEXT: v_pk_max_f16 v4, v0, v1 ; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: v_pk_max_f16 v4, v0, v2 +; GFX940-NEXT: s_nop 0 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5] ; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -5481,27 +5461,28 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, s16 -; GFX11-NEXT: s_add_i32 s5, s16, 0x400 -; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 +; GFX11-NEXT: s_add_i32 s4, s16, 0x400 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_mov_b32_e32 v3, s4 +; GFX11-NEXT: v_pk_max_f16 v2, v1, v1 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 +; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, s5 -; GFX11-NEXT: v_pk_max_f16 v0, v2, v2 +; GFX11-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v1, v4, v4 -; GFX11-NEXT: v_pk_max_f16 v3, v1, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v0, v5, v5 +; GFX11-NEXT: v_pk_max_f16 v4, v0, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], 0 offen glc +; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -5513,27 +5494,27 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 -; GFX10-NEXT: s_add_i32 s5, s20, 0x400 -; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 +; GFX10-NEXT: v_mov_b32_e32 v3, s4 +; GFX10-NEXT: v_pk_max_f16 v2, v1, v1 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: v_pk_max_f16 v0, v2, v2 -; GFX10-NEXT: v_mov_b32_e32 v5, s5 +; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_max_f16 v1, v4, v4 -; GFX10-NEXT: v_pk_max_f16 v3, v1, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 -; GFX10-NEXT: v_mov_b32_e32 v1, v4 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX10-NEXT: v_pk_max_f16 v0, v5, v5 +; GFX10-NEXT: v_pk_max_f16 v4, v0, v2 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB16_1 @@ -5544,19 +5525,19 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v2, v1, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX90A-NEXT: v_pk_max_f16 v0, v5, v5 -; GFX90A-NEXT: v_pk_max_f16 v4, v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 +; GFX90A-NEXT: v_pk_max_f16 v4, v0, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -5572,25 +5553,25 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v1, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_pk_max_f16 v2, v1, v1 +; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: v_pk_max_f16 v1, v2, v2 -; GFX908-NEXT: v_pk_max_f16 v0, v4, v4 -; GFX908-NEXT: v_pk_max_f16 v3, v0, v1 -; GFX908-NEXT: v_mov_b32_e32 v0, v3 -; GFX908-NEXT: v_mov_b32_e32 v5, s6 -; GFX908-NEXT: v_mov_b32_e32 v1, v4 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v5, v0 +; GFX908-NEXT: v_pk_max_f16 v0, v5, v5 +; GFX908-NEXT: v_pk_max_f16 v4, v0, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB16_1 @@ -5601,29 +5582,29 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s20 ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_sdwa v2, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v3, v1, v1 +; GFX8-NEXT: v_mov_b32_e32 v4, s6 ; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v0, v2, v2 -; GFX8-NEXT: v_max_f16_sdwa v3, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v4, v4 -; GFX8-NEXT: v_max_f16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v0, v6, v0 -; GFX8-NEXT: v_or_b32_e32 v3, v0, v1 -; GFX8-NEXT: v_mov_b32_e32 v0, v3 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, v4 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v1, v6, v6 +; GFX8-NEXT: v_max_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v1, v1, v3 +; GFX8-NEXT: v_or_b32_e32 v5, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, v5 +; GFX8-NEXT: v_mov_b32_e32 v1, v6 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB16_1 @@ -5646,30 +5627,30 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_mov_b32_e32 v8, s6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_max_f32_e32 v4, v4, v2 -; GFX7-NEXT: v_max_f32_e32 v5, v5, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v5 -; GFX7-NEXT: v_or_b32_e32 v5, v0, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX7-NEXT: v_or_b32_e32 v4, v6, v0 +; GFX7-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX7-NEXT: v_max_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v0 +; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[16:19], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v7 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB16_1 @@ -5692,31 +5673,31 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, s6 ; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_mov_b32_e32 v8, s6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_max_f32_e32 v4, v4, v2 -; GFX6-NEXT: v_max_f32_e32 v5, v5, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX6-NEXT: v_max_f32_e32 v6, v6, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v5 -; GFX6-NEXT: v_or_b32_e32 v5, v0, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX6-NEXT: v_or_b32_e32 v4, v6, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v5, v7, v0 +; GFX6-NEXT: v_mov_b32_e32 v8, v6 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[16:19], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v7 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB16_1 @@ -5738,25 +5719,26 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, s16 -; GFX12-NEXT: s_add_co_i32 s5, s16, 0x400 +; GFX12-NEXT: s_add_co_i32 s4, s16, 0x400 +; GFX12-NEXT: v_pk_max_num_f16 v2, v0, v0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v3, s4 ; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], null offen offset:1024 +; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024 ; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_pk_max_num_f16 v1, v0, v0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 +; GFX12-NEXT: v_pk_max_num_f16 v0, v1, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_pk_max_num_f16 v1, v3, v1 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_dual_mov_b32 v5, s5 :: v_dual_mov_b32 v4, v2 -; GFX12-NEXT: v_mov_b32_e32 v3, v1 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v0, v0, v2 +; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX12-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 +; GFX12-NEXT: v_mov_b32_e32 v1, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -5770,24 +5752,25 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v1, s16 -; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:1024 +; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 ; GFX940-NEXT: s_add_i32 s6, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 +; GFX940-NEXT: v_pk_max_f16 v2, v0, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, s6 ; GFX940-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_pk_max_f16 v1, v0, v0 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX940-NEXT: v_mov_b32_e32 v6, s6 -; GFX940-NEXT: v_pk_max_f16 v2, v2, v1 +; GFX940-NEXT: v_pk_max_f16 v0, v1, v1 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3] -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[0:3], 0 offen sc0 +; GFX940-NEXT: v_pk_max_f16 v0, v0, v2 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[0:1] +; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_cbranch_execnz .LBB17_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5798,25 +5781,25 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v1, s16 -; GFX11-NEXT: s_add_i32 s5, s16, 0x400 +; GFX11-NEXT: s_add_i32 s4, s16, 0x400 +; GFX11-NEXT: v_pk_max_f16 v2, v0, v0 +; GFX11-NEXT: v_mov_b32_e32 v3, s4 ; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024 +; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_pk_max_f16 v1, v0, v0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX11-NEXT: v_pk_max_f16 v0, v1, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_pk_max_f16 v1, v3, v1 -; GFX11-NEXT: v_dual_mov_b32 v5, s5 :: v_dual_mov_b32 v4, v2 -; GFX11-NEXT: v_mov_b32_e32 v3, v1 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v0, v0, v2 +; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, v4 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -5829,25 +5812,25 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s20 -; GFX10-NEXT: s_add_i32 s5, s20, 0x400 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 +; GFX10-NEXT: v_pk_max_f16 v2, v0, v0 +; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_pk_max_f16 v1, v0, v0 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX10-NEXT: v_mov_b32_e32 v5, s5 +; GFX10-NEXT: v_pk_max_f16 v0, v1, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_max_f16 v1, v3, v1 -; GFX10-NEXT: v_mov_b32_e32 v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX10-NEXT: v_pk_max_f16 v0, v0, v2 +; GFX10-NEXT: v_mov_b32_e32 v5, v1 +; GFX10-NEXT: v_mov_b32_e32 v4, v0 +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB17_1 @@ -5859,23 +5842,23 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, s20 -; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024 +; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v2, v0, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_pk_max_f16 v1, v0, v0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v6, s6 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc +; GFX90A-NEXT: v_pk_max_f16 v0, v1, v1 +; GFX90A-NEXT: v_pk_max_f16 v0, v0, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5886,24 +5869,24 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, s20 -; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_pk_max_f16 v2, v0, v0 +; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_pk_max_f16 v1, v0, v0 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX908-NEXT: v_pk_max_f16 v1, v3, v1 -; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_mov_b32_e32 v5, s6 -; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX908-NEXT: v_pk_max_f16 v0, v1, v1 +; GFX908-NEXT: v_pk_max_f16 v0, v0, v2 +; GFX908-NEXT: v_mov_b32_e32 v5, v1 +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v2, v3 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB17_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5914,28 +5897,28 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s20 -; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_sdwa v2, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v3, v0, v0 +; GFX8-NEXT: v_mov_b32_e32 v4, s6 ; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_max_f16_sdwa v1, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v4, v0, v0 -; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX8-NEXT: v_max_f16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v3, v5, v4 -; GFX8-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_mov_b32_e32 v6, s6 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v6, s[16:19], 0 offen glc +; GFX8-NEXT: v_max_f16_sdwa v0, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v1, v1 +; GFX8-NEXT: v_max_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v5, v5, v3 +; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 +; GFX8-NEXT: v_mov_b32_e32 v6, v1 +; GFX8-NEXT: v_mov_b32_e32 v5, v0 +; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v2, v3 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB17_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5946,41 +5929,41 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0 ; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX7-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_mov_b32_e32 v7, s6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_max_f32_e32 v4, v4, v0 -; GFX7-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_or_b32_e32 v4, v2, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v3, v5, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_max_f32_e32 v5, v5, v0 +; GFX7-NEXT: v_max_f32_e32 v6, v6, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_or_b32_e32 v5, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX7-NEXT: v_or_b32_e32 v4, v6, v3 +; GFX7-NEXT: v_mov_b32_e32 v7, v5 ; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v7, s[16:19], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB17_1 @@ -5992,42 +5975,42 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 ; GFX6-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_mov_b32_e32 v7, s6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_max_f32_e32 v4, v4, v0 -; GFX6-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_or_b32_e32 v4, v2, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX6-NEXT: v_or_b32_e32 v3, v5, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_max_f32_e32 v5, v5, v0 +; GFX6-NEXT: v_max_f32_e32 v6, v6, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_or_b32_e32 v5, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX6-NEXT: v_or_b32_e32 v4, v6, v3 +; GFX6-NEXT: v_mov_b32_e32 v7, v5 ; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v7, s[16:19], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB17_1 @@ -6048,7 +6031,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_nc_u32_e32 v9, 0x400, v4 +; GFX12-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 ; GFX12-NEXT: s_mov_b32 s1, exec_lo ; GFX12-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 @@ -6064,26 +6047,26 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_load_b32 v8, v4, s[4:7], null offen offset:1024 +; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024 ; GFX12-NEXT: ; implicit-def: $vgpr4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB18_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s1 +; GFX12-NEXT: v_pk_max_num_f16 v8, v5, v5 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Loop Header: Depth=1 ; GFX12-NEXT: ; Child Loop BB18_4 Depth 2 -; GFX12-NEXT: v_pk_max_num_f16 v4, v5, v5 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v6, v8, v8 +; GFX12-NEXT: v_pk_max_num_f16 v4, v6, v6 ; GFX12-NEXT: s_mov_b32 s2, exec_lo ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v7, v6, v4 -; GFX12-NEXT: v_mov_b32_e32 v6, v7 -; GFX12-NEXT: v_mov_b32_e32 v7, v8 +; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v8 +; GFX12-NEXT: v_mov_b32_e32 v4, v5 +; GFX12-NEXT: v_mov_b32_e32 v5, v6 ; GFX12-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 @@ -6099,15 +6082,15 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB18_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 -; GFX12-NEXT: v_mov_b32_e32 v8, v6 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 +; GFX12-NEXT: v_mov_b32_e32 v6, v4 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -6115,14 +6098,14 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_cbranch_execnz .LBB18_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: v_mov_b32_e32 v0, v6 +; GFX12-NEXT: v_mov_b32_e32 v0, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_u32_e32 v10, 0x400, v4 +; GFX940-NEXT: v_add_u32_e32 v8, 0x400, v4 ; GFX940-NEXT: s_mov_b64 s[2:3], exec ; GFX940-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 @@ -6134,23 +6117,23 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] ; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: buffer_load_dword v9, v4, s[4:7], 0 offen offset:1024 +; GFX940-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024 ; GFX940-NEXT: ; implicit-def: $vgpr4 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB18_1 ; GFX940-NEXT: ; %bb.2: ; GFX940-NEXT: s_mov_b64 exec, s[2:3] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_pk_max_f16 v9, v5, v5 ; GFX940-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Loop Header: Depth=1 ; GFX940-NEXT: ; Child Loop BB18_4 Depth 2 -; GFX940-NEXT: v_pk_max_f16 v4, v5, v5 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v6, v9, v9 +; GFX940-NEXT: v_pk_max_f16 v4, v7, v7 ; GFX940-NEXT: s_mov_b64 s[8:9], exec -; GFX940-NEXT: v_pk_max_f16 v8, v6, v4 +; GFX940-NEXT: v_pk_max_f16 v6, v4, v9 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[8:9] +; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7] ; GFX940-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX940-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 @@ -6163,27 +6146,27 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_atomic_cmpswap v[6:7], v10, s[4:7], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB18_4 ; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX940-NEXT: s_mov_b64 exec, s[8:9] ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v9, v6 +; GFX940-NEXT: v_mov_b32_e32 v7, v4 ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB18_3 ; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x400, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_mov_b32 s2, exec_lo ; GFX11-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 @@ -6197,25 +6180,25 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-NEXT: buffer_load_b32 v8, v4, s[4:7], 0 offen offset:1024 +; GFX11-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024 ; GFX11-NEXT: ; implicit-def: $vgpr4 ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB18_1 ; GFX11-NEXT: ; %bb.2: ; GFX11-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-NEXT: v_pk_max_f16 v8, v5, v5 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-NEXT: ; Child Loop BB18_4 Depth 2 -; GFX11-NEXT: v_pk_max_f16 v4, v5, v5 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v6, v8, v8 +; GFX11-NEXT: v_pk_max_f16 v4, v6, v6 ; GFX11-NEXT: s_mov_b32 s2, exec_lo ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v7, v6, v4 -; GFX11-NEXT: v_mov_b32_e32 v6, v7 -; GFX11-NEXT: v_mov_b32_e32 v7, v8 +; GFX11-NEXT: v_pk_max_f16 v5, v4, v8 +; GFX11-NEXT: v_mov_b32_e32 v4, v5 +; GFX11-NEXT: v_mov_b32_e32 v5, v6 ; GFX11-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 @@ -6229,14 +6212,14 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], 0 offen glc +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB18_4 ; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 -; GFX11-NEXT: v_mov_b32_e32 v8, v6 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 +; GFX11-NEXT: v_mov_b32_e32 v6, v4 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 @@ -6245,13 +6228,13 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX11-NEXT: s_cbranch_execnz .LBB18_3 ; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, v6 +; GFX11-NEXT: v_mov_b32_e32 v0, v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_nc_u32_e32 v9, 0x400, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 @@ -6263,24 +6246,24 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 -; GFX10-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 ; GFX10-NEXT: ; implicit-def: $vgpr4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB18_1 ; GFX10-NEXT: ; %bb.2: ; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: v_pk_max_f16 v8, v5, v5 ; GFX10-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Loop Header: Depth=1 ; GFX10-NEXT: ; Child Loop BB18_4 Depth 2 -; GFX10-NEXT: v_pk_max_f16 v4, v5, v5 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v6, v8, v8 +; GFX10-NEXT: v_pk_max_f16 v4, v6, v6 ; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_max_f16 v7, v6, v4 -; GFX10-NEXT: v_mov_b32_e32 v6, v7 -; GFX10-NEXT: v_mov_b32_e32 v7, v8 +; GFX10-NEXT: v_pk_max_f16 v5, v4, v8 +; GFX10-NEXT: v_mov_b32_e32 v4, v5 +; GFX10-NEXT: v_mov_b32_e32 v5, v6 ; GFX10-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 @@ -6292,15 +6275,15 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB18_4 ; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 -; GFX10-NEXT: v_mov_b32_e32 v8, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 +; GFX10-NEXT: v_mov_b32_e32 v6, v4 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 @@ -6309,13 +6292,13 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX10-NEXT: s_cbranch_execnz .LBB18_3 ; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: v_mov_b32_e32 v0, v6 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_u32_e32 v10, 0x400, v4 +; GFX90A-NEXT: v_add_u32_e32 v8, 0x400, v4 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec ; GFX90A-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -6327,22 +6310,22 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_load_dword v9, v4, s[8:11], 0 offen offset:1024 +; GFX90A-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 ; GFX90A-NEXT: ; implicit-def: $vgpr4 ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB18_1 ; GFX90A-NEXT: ; %bb.2: ; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_pk_max_f16 v9, v5, v5 ; GFX90A-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 ; GFX90A-NEXT: ; Child Loop BB18_4 Depth 2 -; GFX90A-NEXT: v_pk_max_f16 v4, v5, v5 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v6, v9, v9 -; GFX90A-NEXT: v_pk_max_f16 v8, v6, v4 +; GFX90A-NEXT: v_pk_max_f16 v4, v7, v7 +; GFX90A-NEXT: v_pk_max_f16 v6, v4, v9 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[8:9], v[8:9] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -6354,27 +6337,27 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v10, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB18_4 ; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX90A-NEXT: s_mov_b64 exec, s[12:13] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v9, v6 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB18_3 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_u32_e32 v9, 0x400, v4 +; GFX908-NEXT: v_add_u32_e32 v7, 0x400, v4 ; GFX908-NEXT: s_mov_b64 s[6:7], exec ; GFX908-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 @@ -6386,23 +6369,23 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 +; GFX908-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 ; GFX908-NEXT: ; implicit-def: $vgpr4 ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB18_1 ; GFX908-NEXT: ; %bb.2: ; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_pk_max_f16 v8, v5, v5 ; GFX908-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Loop Header: Depth=1 ; GFX908-NEXT: ; Child Loop BB18_4 Depth 2 -; GFX908-NEXT: v_pk_max_f16 v4, v5, v5 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v6, v8, v8 -; GFX908-NEXT: v_pk_max_f16 v7, v6, v4 -; GFX908-NEXT: v_mov_b32_e32 v6, v7 +; GFX908-NEXT: v_pk_max_f16 v4, v6, v6 +; GFX908-NEXT: v_pk_max_f16 v5, v4, v8 +; GFX908-NEXT: v_mov_b32_e32 v4, v5 ; GFX908-NEXT: s_mov_b64 s[12:13], exec -; GFX908-NEXT: v_mov_b32_e32 v7, v8 +; GFX908-NEXT: v_mov_b32_e32 v5, v6 ; GFX908-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 @@ -6414,27 +6397,27 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB18_4 ; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v8, v6 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB18_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v0, v6 +; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x400, v4 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x400, v4 ; GFX8-NEXT: s_mov_b64 s[6:7], exec ; GFX8-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 @@ -6446,27 +6429,27 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 +; GFX8-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 ; GFX8-NEXT: ; implicit-def: $vgpr4 ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB18_1 ; GFX8-NEXT: ; %bb.2: ; GFX8-NEXT: s_mov_b64 exec, s[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_max_f16_sdwa v8, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v9, v5, v5 ; GFX8-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Loop Header: Depth=1 ; GFX8-NEXT: ; Child Loop BB18_4 Depth 2 -; GFX8-NEXT: v_max_f16_sdwa v4, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v6, v8, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_sdwa v4, v6, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v6, v5, v5 -; GFX8-NEXT: v_max_f16_e32 v7, v8, v8 -; GFX8-NEXT: v_max_f16_e32 v6, v7, v6 -; GFX8-NEXT: v_or_b32_e32 v7, v6, v4 -; GFX8-NEXT: v_mov_b32_e32 v6, v7 +; GFX8-NEXT: v_max_f16_sdwa v4, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 +; GFX8-NEXT: v_max_f16_sdwa v4, v4, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v5, v5, v9 +; GFX8-NEXT: v_or_b32_e32 v5, v5, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, v5 ; GFX8-NEXT: s_mov_b64 s[12:13], exec -; GFX8-NEXT: v_mov_b32_e32 v7, v8 +; GFX8-NEXT: v_mov_b32_e32 v5, v6 ; GFX8-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 @@ -6478,21 +6461,21 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB18_4 ; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v8, v6 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB18_3 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v0, v6 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -6669,43 +6652,45 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v2, v0 -; GFX12-NEXT: v_mov_b32_e32 v0, s16 -; GFX12-NEXT: s_add_co_i32 s6, s16, 0x400 +; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 +; GFX12-NEXT: s_add_co_i32 s4, s16, 0x400 ; GFX12-NEXT: s_mov_b32 s5, 0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v4, s4 +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024 +; GFX12-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX12-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_and_b32 v1, 0xffff0000, v2 +; GFX12-NEXT: v_mov_b32_e32 v6, v0 ; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX12-NEXT: v_max_num_f32_e32 v1, v1, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX12-NEXT: v_max_num_f32_e32 v1, v5, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v5, v1, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX12-NEXT: v_bfe_u32 v7, v1, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v1 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX12-NEXT: v_add3_u32 v5, v5, v1, 0x7fff +; GFX12-NEXT: v_add3_u32 v7, v7, v1, 0x7fff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_max_num_f32 v0, v3, v0 :: v_dual_cndmask_b32 v1, v5, v7 -; GFX12-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; GFX12-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6 +; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX12-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v5, s6 -; GFX12-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX12-NEXT: v_add3_u32 v5, v5, v0, 0x7fff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e64 v0, v3, v6, s4 -; GFX12-NEXT: v_perm_b32 v3, v1, v0, 0x7060302 +; GFX12-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 +; GFX12-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -6718,41 +6703,41 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 ; GFX940-NEXT: v_mov_b32_e32 v0, s16 ; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s8, s16, 0x400 +; GFX940-NEXT: s_add_i32 s4, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[6:7], 0 -; GFX940-NEXT: s_movk_i32 s9, 0x7fff -; GFX940-NEXT: s_mov_b32 s10, 0x7060302 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX940-NEXT: s_movk_i32 s8, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX940-NEXT: s_mov_b32 s9, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, s4 ; GFX940-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v0 -; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX940-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX940-NEXT: v_max_f32_e32 v1, v4, v1 -; GFX940-NEXT: v_max_f32_e32 v0, v6, v0 -; GFX940-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX940-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX940-NEXT: v_add3_u32 v4, v4, v1, s9 -; GFX940-NEXT: v_add3_u32 v7, v7, v0, s9 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, s8 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v0, v4, v6, s[4:5] -; GFX940-NEXT: v_perm_b32 v4, v1, v0, s10 -; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5] +; GFX940-NEXT: v_mov_b32_e32 v7, v0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 +; GFX940-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX940-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX940-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v1, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX940-NEXT: v_add3_u32 v5, v5, v0, s8 +; GFX940-NEXT: v_add3_u32 v8, v8, v1, s8 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 +; GFX940-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5] +; GFX940-NEXT: v_perm_b32 v6, v1, v0, s9 +; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[6:7] +; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 ; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX940-NEXT: s_cbranch_execnz .LBB19_1 @@ -6763,45 +6748,47 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, s16 -; GFX11-NEXT: s_add_i32 s6, s16, 0x400 +; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 +; GFX11-NEXT: s_add_i32 s4, s16, 0x400 ; GFX11-NEXT: s_mov_b32 s5, 0 +; GFX11-NEXT: v_mov_b32_e32 v4, s4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_and_b32 v1, 0xffff0000, v2 +; GFX11-NEXT: v_mov_b32_e32 v6, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX11-NEXT: v_max_f32_e32 v1, v1, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX11-NEXT: v_max_f32_e32 v1, v5, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v5, v1, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX11-NEXT: v_bfe_u32 v7, v1, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: v_add3_u32 v5, v5, v1, 0x7fff +; GFX11-NEXT: v_add3_u32 v7, v7, v1, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_max_f32 v0, v3, v0 :: v_dual_cndmask_b32 v1, v5, v7 -; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; GFX11-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6 +; GFX11-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX11-NEXT: v_mov_b32_e32 v5, s6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff -; GFX11-NEXT: v_cndmask_b32_e64 v0, v3, v6, s4 +; GFX11-NEXT: v_add3_u32 v5, v5, v0, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v1, v0, 0x7060302 -; GFX11-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], 0 offen glc +; GFX11-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 +; GFX11-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -6814,41 +6801,41 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 -; GFX10-NEXT: s_add_i32 s6, s20, 0x400 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 ; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, s4 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX10-NEXT: v_mov_b32_e32 v6, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX10-NEXT: v_max_f32_e32 v0, v3, v0 -; GFX10-NEXT: v_max_f32_e32 v1, v5, v1 -; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v5, v1, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX10-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v1, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v1 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v5, v5, v1, 0x7fff +; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v1, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v3, v6, s4 -; GFX10-NEXT: v_mov_b32_e32 v5, s6 -; GFX10-NEXT: v_perm_b32 v3, v1, v0, 0x7060302 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 -; GFX10-NEXT: v_mov_b32_e32 v1, v4 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 +; GFX10-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 +; GFX10-NEXT: v_mov_b32_e32 v0, v5 +; GFX10-NEXT: v_mov_b32_e32 v1, v6 +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB19_1 @@ -6859,40 +6846,40 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s8, s20, 0x400 +; GFX90A-NEXT: s_add_i32 s4, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: s_movk_i32 s9, 0x7fff -; GFX90A-NEXT: s_mov_b32 s10, 0x7060302 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, s4 ; GFX90A-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX90A-NEXT: v_max_f32_e32 v1, v4, v1 -; GFX90A-NEXT: v_max_f32_e32 v0, v6, v0 -; GFX90A-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX90A-NEXT: v_add3_u32 v4, v4, v1, s9 -; GFX90A-NEXT: v_add3_u32 v7, v7, v0, s9 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v4, v6, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc -; GFX90A-NEXT: v_perm_b32 v4, v1, v0, s10 -; GFX90A-NEXT: v_mov_b32_e32 v3, s8 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 +; GFX90A-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX90A-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX90A-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v1, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX90A-NEXT: v_add3_u32 v5, v5, v0, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v1, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v6, v1, v0, s9 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB19_1 @@ -6903,41 +6890,41 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v1, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s8, s20, 0x400 +; GFX908-NEXT: s_add_i32 s4, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: s_movk_i32 s9, 0x7fff -; GFX908-NEXT: s_mov_b32 s10, 0x7060302 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX908-NEXT: s_mov_b32 s9, 0x7060302 +; GFX908-NEXT: v_mov_b32_e32 v4, s4 ; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX908-NEXT: v_max_f32_e32 v1, v3, v1 -; GFX908-NEXT: v_max_f32_e32 v0, v6, v0 -; GFX908-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX908-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX908-NEXT: v_add3_u32 v3, v3, v1, s9 -; GFX908-NEXT: v_add3_u32 v7, v7, v0, s9 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v3, v6, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc -; GFX908-NEXT: v_perm_b32 v3, v1, v0, s10 -; GFX908-NEXT: v_mov_b32_e32 v0, v3 -; GFX908-NEXT: v_mov_b32_e32 v5, s8 -; GFX908-NEXT: v_mov_b32_e32 v1, v4 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v6, v0 +; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX908-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX908-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX908-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v1, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX908-NEXT: v_add3_u32 v5, v5, v0, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v1, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX908-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v5, v1, v0, s9 +; GFX908-NEXT: v_mov_b32_e32 v0, v5 +; GFX908-NEXT: v_mov_b32_e32 v1, v6 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB19_1 @@ -6948,42 +6935,42 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s20 ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s8, s20, 0x400 +; GFX8-NEXT: s_add_i32 s4, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX8-NEXT: v_max_f32_e32 v1, v3, v1 -; GFX8-NEXT: v_max_f32_e32 v0, v6, v0 -; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v1 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v6, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX8-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX8-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v1, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v0 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_alignbit_b32 v3, v1, v0, 16 -; GFX8-NEXT: v_mov_b32_e32 v0, v3 -; GFX8-NEXT: v_mov_b32_e32 v5, s8 -; GFX8-NEXT: v_mov_b32_e32 v1, v4 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX8-NEXT: v_alignbit_b32 v5, v1, v0, 16 +; GFX8-NEXT: v_mov_b32_e32 v0, v5 +; GFX8-NEXT: v_mov_b32_e32 v1, v6 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB19_1 @@ -6995,38 +6982,38 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_max_f32_e32 v4, v4, v2 -; GFX7-NEXT: v_max_f32_e32 v5, v5, v3 +; GFX7-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX7-NEXT: v_max_f32_e32 v6, v6, v3 ; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v4 -; GFX7-NEXT: v_alignbit_b32 v0, v0, v5, 16 -; GFX7-NEXT: v_mov_b32_e32 v5, v1 -; GFX7-NEXT: v_mov_b32_e32 v6, s6 -; GFX7-NEXT: v_mov_b32_e32 v4, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v6, 16 +; GFX7-NEXT: v_mov_b32_e32 v6, v1 +; GFX7-NEXT: v_mov_b32_e32 v5, v0 +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB19_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7037,39 +7024,39 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v0 +; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, s6 ; GFX6-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_max_f32_e32 v4, v4, v2 -; GFX6-NEXT: v_max_f32_e32 v5, v5, v3 +; GFX6-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX6-NEXT: v_max_f32_e32 v6, v6, v3 ; GFX6-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v4 -; GFX6-NEXT: v_alignbit_b32 v0, v0, v5, 16 -; GFX6-NEXT: v_mov_b32_e32 v5, v1 -; GFX6-NEXT: v_mov_b32_e32 v6, s6 -; GFX6-NEXT: v_mov_b32_e32 v4, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; GFX6-NEXT: v_alignbit_b32 v0, v0, v6, 16 +; GFX6-NEXT: v_mov_b32_e32 v6, v1 +; GFX6-NEXT: v_mov_b32_e32 v5, v0 +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB19_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7089,43 +7076,41 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s16 -; GFX12-NEXT: s_add_co_i32 s6, s16, 0x400 +; GFX12-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0 +; GFX12-NEXT: s_add_co_i32 s4, s16, 0x400 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 +; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024 ; GFX12-NEXT: s_mov_b32 s5, 0 -; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], null offen offset:1024 ; GFX12-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_max_num_f32 v5, v5, v3 :: v_dual_max_num_f32 v0, v0, v2 +; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e64 s4, v0, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_max_num_f32_e32 v1, v3, v1 -; GFX12-NEXT: v_max_num_f32_e32 v3, v5, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v1 -; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-NEXT: v_add3_u32 v4, v4, v1, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s4, v1, v1 -; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e64 v1, v4, v6, s4 -; GFX12-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v5, s6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v1, v3, v1, 0x7060302 -; GFX12-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 +; GFX12-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX12-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 +; GFX12-NEXT: v_mov_b32_e32 v1, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -7139,40 +7124,40 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v1, s16 -; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s8, s16, 0x400 +; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 +; GFX940-NEXT: s_add_i32 s4, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[6:7], 0 -; GFX940-NEXT: s_movk_i32 s9, 0x7fff -; GFX940-NEXT: s_mov_b32 s10, 0x7060302 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX940-NEXT: s_movk_i32 s8, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX940-NEXT: s_mov_b32 s9, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, s4 ; GFX940-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX940-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX940-NEXT: v_max_f32_e32 v1, v2, v1 -; GFX940-NEXT: v_max_f32_e32 v2, v5, v4 -; GFX940-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_add3_u32 v4, v4, v1, s9 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s9 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX940-NEXT: v_mov_b32_e32 v6, s8 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] -; GFX940-NEXT: v_perm_b32 v2, v2, v1, s10 -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX940-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX940-NEXT: v_max_f32_e32 v5, v5, v3 +; GFX940-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v0, s8 +; GFX940-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[0:3], 0 offen sc0 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX940-NEXT: v_perm_b32 v0, v5, v0, s9 +; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[0:1] +; GFX940-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 ; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX940-NEXT: s_cbranch_execnz .LBB20_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7182,45 +7167,43 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s16 -; GFX11-NEXT: s_add_i32 s6, s16, 0x400 +; GFX11-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0 +; GFX11-NEXT: s_add_i32 s4, s16, 0x400 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 +; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: s_mov_b32 s5, 0 -; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_max_f32 v5, v5, v3 :: v_dual_max_f32 v0, v0, v2 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_max_f32_e32 v1, v3, v1 -; GFX11-NEXT: v_max_f32_e32 v3, v5, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v1 -; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-NEXT: v_add3_u32 v4, v4, v1, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s4, v1, v1 -; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v1, v4, v6, s4 -; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo -; GFX11-NEXT: v_mov_b32_e32 v5, s6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v1, v3, v1, 0x7060302 -; GFX11-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 +; GFX11-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, v5 ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -7234,39 +7217,39 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s20 -; GFX10-NEXT: s_add_i32 s6, s20, 0x400 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX10-NEXT: v_mov_b32_e32 v4, s4 +; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_f32_e32 v1, v3, v1 -; GFX10-NEXT: v_max_f32_e32 v3, v5, v4 -; GFX10-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1 -; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX10-NEXT: v_add3_u32 v4, v4, v1, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v1, v1 -; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v1, v4, v6, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v5, s6 -; GFX10-NEXT: v_perm_b32 v1, v3, v1, 0x7060302 -; GFX10-NEXT: v_mov_b32_e32 v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX10-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX10-NEXT: v_max_f32_e32 v5, v5, v3 +; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 +; GFX10-NEXT: v_mov_b32_e32 v6, v1 +; GFX10-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB20_1 @@ -7278,39 +7261,39 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, s20 -; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s8, s20, 0x400 +; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s4, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: s_movk_i32 s9, 0x7fff -; GFX90A-NEXT: s_mov_b32 s10, 0x7060302 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, s4 ; GFX90A-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX90A-NEXT: v_max_f32_e32 v1, v2, v1 -; GFX90A-NEXT: v_max_f32_e32 v2, v5, v4 -; GFX90A-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_add3_u32 v4, v4, v1, s9 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s9 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX90A-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc -; GFX90A-NEXT: v_perm_b32 v2, v2, v1, s10 -; GFX90A-NEXT: v_mov_b32_e32 v6, s8 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX90A-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX90A-NEXT: v_max_f32_e32 v5, v5, v3 +; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s9 +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB20_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7321,40 +7304,40 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, s20 -; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s8, s20, 0x400 +; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s4, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: s_movk_i32 s9, 0x7fff -; GFX908-NEXT: s_mov_b32 s10, 0x7060302 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX908-NEXT: s_mov_b32 s9, 0x7060302 +; GFX908-NEXT: v_mov_b32_e32 v4, s4 ; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX908-NEXT: v_max_f32_e32 v1, v3, v1 -; GFX908-NEXT: v_max_f32_e32 v3, v5, v4 -; GFX908-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX908-NEXT: v_add3_u32 v4, v4, v1, s9 -; GFX908-NEXT: v_add3_u32 v7, v7, v3, s9 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX908-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; GFX908-NEXT: v_perm_b32 v1, v3, v1, s10 -; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_mov_b32_e32 v6, s8 -; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v6, s[16:19], 0 offen glc +; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX908-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX908-NEXT: v_max_f32_e32 v5, v5, v3 +; GFX908-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v0, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9 +; GFX908-NEXT: v_mov_b32_e32 v6, v1 +; GFX908-NEXT: v_mov_b32_e32 v5, v0 +; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v2, v3 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB20_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7365,41 +7348,41 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s20 -; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s8, s20, 0x400 +; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s4, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: v_max_f32_e32 v1, v3, v1 -; GFX8-NEXT: v_max_f32_e32 v3, v5, v4 -; GFX8-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_alignbit_b32 v1, v3, v1, 16 -; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_mov_b32_e32 v6, s8 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v6, s[16:19], 0 offen glc +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX8-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX8-NEXT: v_max_f32_e32 v5, v5, v3 +; GFX8-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 +; GFX8-NEXT: v_mov_b32_e32 v6, v1 +; GFX8-NEXT: v_mov_b32_e32 v5, v0 +; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v2, v3 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB20_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7411,37 +7394,37 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s20 ; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v0 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX7-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_max_f32_e32 v4, v4, v0 -; GFX7-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX7-NEXT: v_alignbit_b32 v3, v3, v2, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; GFX7-NEXT: v_alignbit_b32 v2, v2, v5, 16 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_max_f32_e32 v5, v5, v0 +; GFX7-NEXT: v_max_f32_e32 v6, v6, v1 +; GFX7-NEXT: v_alignbit_b32 v4, v4, v3, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_alignbit_b32 v3, v3, v6, 16 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v6, s6 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB20_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7453,38 +7436,38 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s20 ; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v0 +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v0 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 ; GFX6-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_max_f32_e32 v4, v4, v0 -; GFX6-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX6-NEXT: v_alignbit_b32 v3, v3, v2, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; GFX6-NEXT: v_alignbit_b32 v2, v2, v5, 16 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_max_f32_e32 v5, v5, v0 +; GFX6-NEXT: v_max_f32_e32 v6, v6, v1 +; GFX6-NEXT: v_alignbit_b32 v4, v4, v3, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_alignbit_b32 v3, v3, v6, 16 +; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: v_mov_b32_e32 v6, s6 -; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB20_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7504,7 +7487,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_nc_u32_e32 v9, 0x400, v4 +; GFX12-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 ; GFX12-NEXT: s_mov_b32 s1, exec_lo ; GFX12-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 @@ -7520,43 +7503,42 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_load_b32 v8, v4, s[4:7], null offen offset:1024 +; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024 ; GFX12-NEXT: ; implicit-def: $vgpr4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB21_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s1 +; GFX12-NEXT: v_lshlrev_b32_e32 v8, 16, v5 +; GFX12-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Loop Header: Depth=1 ; GFX12-NEXT: ; Child Loop BB21_4 Depth 2 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX12-NEXT: v_and_b32_e32 v10, 0xffff0000, v8 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX12-NEXT: s_mov_b32 s2, exec_lo ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: v_max_num_f32_e32 v4, v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_max_num_f32_e32 v6, v10, v7 -; GFX12-NEXT: v_bfe_u32 v7, v4, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v11, 0x400000, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_bfe_u32 v10, v6, 16, 1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_max_num_f32 v5, v5, v9 :: v_dual_max_num_f32 v4, v4, v8 +; GFX12-NEXT: v_bfe_u32 v11, v5, 16, 1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v10, v4, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v12, 0x400000, v4 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-NEXT: v_or_b32_e32 v12, 0x400000, v6 -; GFX12-NEXT: v_add3_u32 v7, v7, v4, 0x7fff -; GFX12-NEXT: v_add3_u32 v10, v10, v6, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_cndmask_b32_e32 v4, v7, v11, vcc_lo -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-NEXT: v_cndmask_b32_e32 v6, v10, v12, vcc_lo +; GFX12-NEXT: v_or_b32_e32 v13, 0x400000, v5 +; GFX12-NEXT: v_add3_u32 v11, v11, v5, 0x7fff +; GFX12-NEXT: v_add3_u32 v10, v10, v4, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v7, v6, v4, 0x7060302 -; GFX12-NEXT: v_mov_b32_e32 v6, v7 -; GFX12-NEXT: v_mov_b32_e32 v7, v8 +; GFX12-NEXT: v_perm_b32 v5, v5, v4, 0x7060302 +; GFX12-NEXT: v_mov_b32_e32 v4, v5 +; GFX12-NEXT: v_mov_b32_e32 v5, v6 ; GFX12-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 @@ -7572,15 +7554,15 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB21_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 -; GFX12-NEXT: v_mov_b32_e32 v8, v6 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 +; GFX12-NEXT: v_mov_b32_e32 v6, v4 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -7588,14 +7570,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX12-NEXT: s_cbranch_execnz .LBB21_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: v_mov_b32_e32 v0, v6 +; GFX12-NEXT: v_mov_b32_e32 v0, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_u32_e32 v10, 0x400, v4 +; GFX940-NEXT: v_add_u32_e32 v8, 0x400, v4 ; GFX940-NEXT: s_mov_b64 s[2:3], exec ; GFX940-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 @@ -7607,40 +7589,40 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] ; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: buffer_load_dword v9, v4, s[4:7], 0 offen offset:1024 +; GFX940-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024 ; GFX940-NEXT: ; implicit-def: $vgpr4 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB21_1 ; GFX940-NEXT: ; %bb.2: ; GFX940-NEXT: s_mov_b64 exec, s[2:3] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v9, 16, v5 ; GFX940-NEXT: s_movk_i32 s10, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 ; GFX940-NEXT: s_mov_b32 s11, 0x7060302 ; GFX940-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Loop Header: Depth=1 ; GFX940-NEXT: ; Child Loop BB21_4 Depth 2 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v9 -; GFX940-NEXT: v_max_f32_e32 v4, v6, v4 -; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX940-NEXT: v_add3_u32 v6, v6, v4, s10 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v7 +; GFX940-NEXT: v_max_f32_e32 v4, v4, v9 +; GFX940-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX940-NEXT: v_add3_u32 v5, v5, v4, s10 +; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX940-NEXT: s_mov_b64 s[8:9], exec ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v9 -; GFX940-NEXT: v_max_f32_e32 v6, v7, v6 -; GFX940-NEXT: v_bfe_u32 v7, v6, 16, 1 -; GFX940-NEXT: v_add3_u32 v7, v7, v6, s10 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v6 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX940-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX940-NEXT: v_max_f32_e32 v5, v5, v10 +; GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX940-NEXT: v_add3_u32 v6, v6, v5, s10 +; GFX940-NEXT: v_or_b32_e32 v11, 0x400000, v5 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc -; GFX940-NEXT: v_perm_b32 v8, v6, v4, s11 -; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[8:9] +; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc +; GFX940-NEXT: v_perm_b32 v6, v5, v4, s11 +; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7] ; GFX940-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX940-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 @@ -7653,27 +7635,27 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_atomic_cmpswap v[6:7], v10, s[4:7], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB21_4 ; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX940-NEXT: s_mov_b64 exec, s[8:9] ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v9, v6 +; GFX940-NEXT: v_mov_b32_e32 v7, v4 ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB21_3 ; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x400, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_mov_b32 s2, exec_lo ; GFX11-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 @@ -7687,41 +7669,42 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-NEXT: buffer_load_b32 v8, v4, s[4:7], 0 offen offset:1024 +; GFX11-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024 ; GFX11-NEXT: ; implicit-def: $vgpr4 ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB21_1 ; GFX11-NEXT: ; %bb.2: ; GFX11-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v5 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 +; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-NEXT: ; Child Loop BB21_4 Depth 2 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v8 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX11-NEXT: s_mov_b32 s2, exec_lo ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_max_f32_e32 v4, v6, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_max_f32_e32 v6, v10, v7 -; GFX11-NEXT: v_bfe_u32 v7, v4, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_bfe_u32 v10, v6, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_max_f32 v5, v5, v9 :: v_dual_max_f32 v4, v4, v8 +; GFX11-NEXT: v_bfe_u32 v11, v5, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v10, v4, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v4 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v6 -; GFX11-NEXT: v_add3_u32 v7, v7, v4, 0x7fff -; GFX11-NEXT: v_add3_u32 v10, v10, v6, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e32 v4, v7, v11, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: v_cndmask_b32_e32 v6, v10, v12, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v5 +; GFX11-NEXT: v_add3_u32 v11, v11, v5, 0x7fff +; GFX11-NEXT: v_add3_u32 v10, v10, v4, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v7, v6, v4, 0x7060302 -; GFX11-NEXT: v_mov_b32_e32 v6, v7 -; GFX11-NEXT: v_mov_b32_e32 v7, v8 +; GFX11-NEXT: v_perm_b32 v5, v5, v4, 0x7060302 +; GFX11-NEXT: v_mov_b32_e32 v4, v5 +; GFX11-NEXT: v_mov_b32_e32 v5, v6 ; GFX11-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 @@ -7735,14 +7718,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], 0 offen glc +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB21_4 ; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 -; GFX11-NEXT: v_mov_b32_e32 v8, v6 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 +; GFX11-NEXT: v_mov_b32_e32 v6, v4 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 @@ -7750,14 +7733,15 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_cbranch_execnz .LBB21_3 ; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, v6 +; GFX11-NEXT: v_mov_b32_e32 v0, v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_nc_u32_e32 v9, 0x400, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 @@ -7769,38 +7753,38 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 -; GFX10-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 ; GFX10-NEXT: ; implicit-def: $vgpr4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB21_1 ; GFX10-NEXT: ; %bb.2: ; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v5 +; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 ; GFX10-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Loop Header: Depth=1 ; GFX10-NEXT: ; Child Loop BB21_4 Depth 2 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v8 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_f32_e32 v4, v6, v4 -; GFX10-NEXT: v_max_f32_e32 v6, v10, v7 -; GFX10-NEXT: v_bfe_u32 v7, v4, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v4 -; GFX10-NEXT: v_bfe_u32 v10, v6, 16, 1 +; GFX10-NEXT: v_max_f32_e32 v4, v4, v8 +; GFX10-NEXT: v_max_f32_e32 v5, v5, v9 +; GFX10-NEXT: v_bfe_u32 v10, v4, 16, 1 +; GFX10-NEXT: v_bfe_u32 v11, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v4 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v6 -; GFX10-NEXT: v_add3_u32 v7, v7, v4, 0x7fff -; GFX10-NEXT: v_add3_u32 v10, v10, v6, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v4, v7, v11, vcc_lo -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v10, v12, vcc_lo -; GFX10-NEXT: v_perm_b32 v7, v6, v4, 0x7060302 -; GFX10-NEXT: v_mov_b32_e32 v6, v7 -; GFX10-NEXT: v_mov_b32_e32 v7, v8 +; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v5 +; GFX10-NEXT: v_add3_u32 v10, v10, v4, 0x7fff +; GFX10-NEXT: v_add3_u32 v11, v11, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo +; GFX10-NEXT: v_perm_b32 v5, v5, v4, 0x7060302 +; GFX10-NEXT: v_mov_b32_e32 v4, v5 +; GFX10-NEXT: v_mov_b32_e32 v5, v6 ; GFX10-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 @@ -7812,15 +7796,15 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB21_4 ; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 -; GFX10-NEXT: v_mov_b32_e32 v8, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 +; GFX10-NEXT: v_mov_b32_e32 v6, v4 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 @@ -7829,13 +7813,13 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX10-NEXT: s_cbranch_execnz .LBB21_3 ; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: v_mov_b32_e32 v0, v6 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_u32_e32 v10, 0x400, v4 +; GFX90A-NEXT: v_add_u32_e32 v8, 0x400, v4 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec ; GFX90A-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -7847,38 +7831,38 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_load_dword v9, v4, s[8:11], 0 offen offset:1024 +; GFX90A-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 ; GFX90A-NEXT: ; implicit-def: $vgpr4 ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB21_1 ; GFX90A-NEXT: ; %bb.2: ; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v9, 16, v5 ; GFX90A-NEXT: s_movk_i32 s14, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 ; GFX90A-NEXT: s_mov_b32 s15, 0x7060302 ; GFX90A-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 ; GFX90A-NEXT: ; Child Loop BB21_4 Depth 2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v9 -; GFX90A-NEXT: v_max_f32_e32 v4, v6, v4 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s14 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v7 +; GFX90A-NEXT: v_max_f32_e32 v4, v4, v9 +; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s14 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v9 -; GFX90A-NEXT: v_max_f32_e32 v6, v7, v6 -; GFX90A-NEXT: v_bfe_u32 v7, v6, 16, 1 -; GFX90A-NEXT: v_add3_u32 v7, v7, v6, s14 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc -; GFX90A-NEXT: v_perm_b32 v8, v6, v4, s15 +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX90A-NEXT: v_max_f32_e32 v5, v5, v10 +; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s14 +; GFX90A-NEXT: v_or_b32_e32 v11, 0x400000, v5 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc +; GFX90A-NEXT: v_perm_b32 v6, v5, v4, s15 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[8:9], v[8:9] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -7890,27 +7874,27 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v10, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB21_4 ; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX90A-NEXT: s_mov_b64 exec, s[12:13] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v9, v6 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB21_3 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_u32_e32 v9, 0x400, v4 +; GFX908-NEXT: v_add_u32_e32 v7, 0x400, v4 ; GFX908-NEXT: s_mov_b64 s[6:7], exec ; GFX908-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 @@ -7922,39 +7906,39 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 +; GFX908-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 ; GFX908-NEXT: ; implicit-def: $vgpr4 ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB21_1 ; GFX908-NEXT: ; %bb.2: ; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v8, 16, v5 ; GFX908-NEXT: s_movk_i32 s14, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 ; GFX908-NEXT: s_mov_b32 s15, 0x7060302 ; GFX908-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Loop Header: Depth=1 ; GFX908-NEXT: ; Child Loop BB21_4 Depth 2 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; GFX908-NEXT: v_max_f32_e32 v4, v6, v4 -; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX908-NEXT: v_add3_u32 v6, v6, v4, s14 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX908-NEXT: v_max_f32_e32 v4, v4, v8 +; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX908-NEXT: v_add3_u32 v5, v5, v4, s14 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v4 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX908-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v8 -; GFX908-NEXT: v_max_f32_e32 v6, v7, v6 -; GFX908-NEXT: v_bfe_u32 v7, v6, 16, 1 -; GFX908-NEXT: v_add3_u32 v7, v7, v6, s14 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX908-NEXT: v_cndmask_b32_e32 v6, v7, v10, vcc -; GFX908-NEXT: v_perm_b32 v7, v6, v4, s15 -; GFX908-NEXT: v_mov_b32_e32 v6, v7 +; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v10, vcc +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_max_f32_e32 v5, v5, v9 +; GFX908-NEXT: v_bfe_u32 v10, v5, 16, 1 +; GFX908-NEXT: v_add3_u32 v10, v10, v5, s14 +; GFX908-NEXT: v_or_b32_e32 v11, 0x400000, v5 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v4, s15 +; GFX908-NEXT: v_mov_b32_e32 v4, v5 ; GFX908-NEXT: s_mov_b64 s[12:13], exec -; GFX908-NEXT: v_mov_b32_e32 v7, v8 +; GFX908-NEXT: v_mov_b32_e32 v5, v6 ; GFX908-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 @@ -7966,27 +7950,27 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB21_4 ; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v8, v6 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB21_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v0, v6 +; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x400, v4 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x400, v4 ; GFX8-NEXT: s_mov_b64 s[6:7], exec ; GFX8-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 @@ -7998,40 +7982,40 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 +; GFX8-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 ; GFX8-NEXT: ; implicit-def: $vgpr4 ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB21_1 ; GFX8-NEXT: ; %bb.2: ; GFX8-NEXT: s_mov_b64 exec, s[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v5 +; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 ; GFX8-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Loop Header: Depth=1 ; GFX8-NEXT: ; Child Loop BB21_4 Depth 2 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; GFX8-NEXT: v_max_f32_e32 v4, v6, v4 -; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX8-NEXT: v_max_f32_e32 v4, v4, v8 +; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v4 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v8 -; GFX8-NEXT: v_max_f32_e32 v6, v7, v6 -; GFX8-NEXT: v_bfe_u32 v7, v6, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v7, v10, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v7, v6, v4, 16 -; GFX8-NEXT: v_mov_b32_e32 v6, v7 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v10, vcc +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_max_f32_e32 v5, v5, v9 +; GFX8-NEXT: v_bfe_u32 v10, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v5 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, 0x7fff, v10 +; GFX8-NEXT: v_or_b32_e32 v11, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v5, v5, v4, 16 +; GFX8-NEXT: v_mov_b32_e32 v4, v5 ; GFX8-NEXT: s_mov_b64 s[12:13], exec -; GFX8-NEXT: v_mov_b32_e32 v7, v8 +; GFX8-NEXT: v_mov_b32_e32 v5, v6 ; GFX8-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 @@ -8043,21 +8027,21 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB21_4 ; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v8, v6 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB21_3 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v0, v6 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -8236,19 +8220,19 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_ ; GFX940-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 ; GFX940-NEXT: v_mov_b32_e32 v0, s16 ; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 ; GFX940-NEXT: s_add_i32 s6, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, s6 ; GFX940-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v0 -; GFX940-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, s6 +; GFX940-NEXT: v_max_f32_e32 v4, v0, v2 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5] ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 @@ -8287,19 +8271,19 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_ ; GFX90A-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 +; GFX90A-NEXT: v_max_f32_e32 v4, v0, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc @@ -8317,25 +8301,25 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_ ; GFX908-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v1, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX908-NEXT: v_max_f32_e32 v0, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v0, v1 -; GFX908-NEXT: v_mov_b32_e32 v0, v3 -; GFX908-NEXT: v_mov_b32_e32 v5, s6 -; GFX908-NEXT: v_mov_b32_e32 v1, v4 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v5, v0 +; GFX908-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX908-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB22_1 @@ -8352,19 +8336,19 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_ ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 +; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v4 -; GFX8-NEXT: v_max_f32_e32 v3, v0, v2 -; GFX8-NEXT: v_mov_b32_e32 v0, v3 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, v4 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v5, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; GFX8-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB22_1 diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll index 0d38ee36415b8..0bcaacc6b08e8 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll @@ -31,19 +31,19 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 ; GFX940-NEXT: v_mov_b32_e32 v0, s16 ; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 ; GFX940-NEXT: s_add_i32 s6, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, s6 ; GFX940-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v0 -; GFX940-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, s6 +; GFX940-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5] ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 @@ -82,19 +82,19 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 +; GFX90A-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -110,25 +110,25 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v1, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX908-NEXT: v_max_f32_e32 v0, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v0, v1 -; GFX908-NEXT: v_mov_b32_e32 v0, v3 -; GFX908-NEXT: v_mov_b32_e32 v5, s6 -; GFX908-NEXT: v_mov_b32_e32 v1, v4 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v5, v0 +; GFX908-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX908-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB0_1 @@ -145,19 +145,19 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 +; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v4 -; GFX8-NEXT: v_min_f32_e32 v3, v0, v2 -; GFX8-NEXT: v_mov_b32_e32 v0, v3 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, v4 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v5, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; GFX8-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB0_1 @@ -207,24 +207,24 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_ ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v1, s16 -; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:1024 +; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 ; GFX940-NEXT: s_add_i32 s6, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v0, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, s6 ; GFX940-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_max_f32_e32 v1, v0, v0 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX940-NEXT: v_min_f32_e32 v2, v2, v1 -; GFX940-NEXT: v_mov_b32_e32 v6, s6 -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX940-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX940-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[0:1] ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[0:3], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_cbranch_execnz .LBB1_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -257,23 +257,23 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_ ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, s20 -; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024 +; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v0, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_max_f32_e32 v1, v0, v0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v6, s6 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc +; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX90A-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB1_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -284,24 +284,24 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_ ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, s20 -; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v0, v0 +; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f32_e32 v1, v0, v0 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 -; GFX908-NEXT: v_min_f32_e32 v1, v3, v1 -; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_mov_b32_e32 v5, s6 -; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX908-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX908-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX908-NEXT: v_mov_b32_e32 v5, v1 +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v2, v3 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB1_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -316,20 +316,20 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_ ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v0 +; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v1 ; GFX8-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX8-NEXT: v_mov_b32_e32 v4, v1 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 -; GFX8-NEXT: v_mov_b32_e32 v3, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v5, v1 +; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v3 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB1_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -400,7 +400,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_u32_e32 v10, 0x400, v4 +; GFX940-NEXT: v_add_u32_e32 v8, 0x400, v4 ; GFX940-NEXT: s_mov_b64 s[2:3], exec ; GFX940-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 @@ -412,22 +412,22 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] ; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: buffer_load_dword v9, v4, s[4:7], 0 offen offset:1024 +; GFX940-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024 ; GFX940-NEXT: ; implicit-def: $vgpr4 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB2_1 ; GFX940-NEXT: ; %bb.2: ; GFX940-NEXT: s_mov_b64 exec, s[2:3] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_max_f32_e32 v9, v5, v5 ; GFX940-NEXT: .LBB2_3: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Loop Header: Depth=1 ; GFX940-NEXT: ; Child Loop BB2_4 Depth 2 -; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v6, v9, v9 -; GFX940-NEXT: v_min_f32_e32 v8, v6, v4 +; GFX940-NEXT: v_max_f32_e32 v4, v7, v7 +; GFX940-NEXT: v_min_f32_e32 v6, v4, v9 ; GFX940-NEXT: s_mov_b64 s[8:9], exec -; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[8:9] +; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7] ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1 ; GFX940-NEXT: ; => This Inner Loop Header: Depth=2 @@ -441,21 +441,21 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_atomic_cmpswap v[6:7], v10, s[4:7], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB2_4 ; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB2_3 Depth=1 ; GFX940-NEXT: s_mov_b64 exec, s[8:9] ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v9, v6 +; GFX940-NEXT: v_mov_b32_e32 v7, v4 ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB2_3 ; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -520,7 +520,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_u32_e32 v10, 0x400, v4 +; GFX90A-NEXT: v_add_u32_e32 v8, 0x400, v4 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec ; GFX90A-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -532,22 +532,22 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_load_dword v9, v4, s[8:11], 0 offen offset:1024 +; GFX90A-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 ; GFX90A-NEXT: ; implicit-def: $vgpr4 ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB2_1 ; GFX90A-NEXT: ; %bb.2: ; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_max_f32_e32 v9, v5, v5 ; GFX90A-NEXT: .LBB2_3: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 ; GFX90A-NEXT: ; Child Loop BB2_4 Depth 2 -; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v6, v9, v9 -; GFX90A-NEXT: v_min_f32_e32 v8, v6, v4 +; GFX90A-NEXT: v_max_f32_e32 v4, v7, v7 +; GFX90A-NEXT: v_min_f32_e32 v6, v4, v9 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[8:9], v[8:9] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -559,27 +559,27 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v10, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB2_4 ; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB2_3 Depth=1 ; GFX90A-NEXT: s_mov_b64 exec, s[12:13] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v9, v6 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB2_3 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_u32_e32 v9, 0x400, v4 +; GFX908-NEXT: v_add_u32_e32 v7, 0x400, v4 ; GFX908-NEXT: s_mov_b64 s[6:7], exec ; GFX908-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 @@ -591,23 +591,23 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 +; GFX908-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 ; GFX908-NEXT: ; implicit-def: $vgpr4 ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB2_1 ; GFX908-NEXT: ; %bb.2: ; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_max_f32_e32 v8, v5, v5 ; GFX908-NEXT: .LBB2_3: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Loop Header: Depth=1 ; GFX908-NEXT: ; Child Loop BB2_4 Depth 2 -; GFX908-NEXT: v_max_f32_e32 v4, v5, v5 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v6, v8, v8 -; GFX908-NEXT: v_min_f32_e32 v7, v6, v4 -; GFX908-NEXT: v_mov_b32_e32 v6, v7 +; GFX908-NEXT: v_max_f32_e32 v4, v6, v6 +; GFX908-NEXT: v_min_f32_e32 v5, v4, v8 +; GFX908-NEXT: v_mov_b32_e32 v4, v5 ; GFX908-NEXT: s_mov_b64 s[12:13], exec -; GFX908-NEXT: v_mov_b32_e32 v7, v8 +; GFX908-NEXT: v_mov_b32_e32 v5, v6 ; GFX908-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 @@ -619,21 +619,21 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB2_4 ; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB2_3 Depth=1 ; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v8, v6 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB2_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v0, v6 +; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -771,19 +771,19 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 ; GFX940-NEXT: v_mov_b32_e32 v0, s16 ; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 ; GFX940-NEXT: s_add_i32 s6, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, s6 ; GFX940-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v0 -; GFX940-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, s6 +; GFX940-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5] ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 @@ -800,28 +800,27 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, s16 -; GFX11-NEXT: s_add_i32 s5, s16, 0x400 -; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 +; GFX11-NEXT: s_add_i32 s4, s16, 0x400 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_max_f32 v2, v1, v1 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 +; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v0 -; GFX11-NEXT: v_max_f32_e32 v0, v2, v2 +; GFX11-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f32_e32 v1, v4, v4 -; GFX11-NEXT: v_min_f32_e32 v3, v1, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX11-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v5, s5 :: v_dual_mov_b32 v0, v3 -; GFX11-NEXT: v_mov_b32_e32 v1, v4 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], 0 offen glc +; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -833,27 +832,27 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 -; GFX10-NEXT: s_add_i32 s5, s20, 0x400 -; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 +; GFX10-NEXT: v_mov_b32_e32 v3, s4 +; GFX10-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: v_max_f32_e32 v0, v2, v2 -; GFX10-NEXT: v_mov_b32_e32 v5, s5 +; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_f32_e32 v1, v4, v4 -; GFX10-NEXT: v_min_f32_e32 v3, v1, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 -; GFX10-NEXT: v_mov_b32_e32 v1, v4 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX10-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX10-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB3_1 @@ -864,19 +863,19 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 +; GFX90A-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -892,25 +891,25 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v1, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX908-NEXT: v_max_f32_e32 v0, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v0, v1 -; GFX908-NEXT: v_mov_b32_e32 v0, v3 -; GFX908-NEXT: v_mov_b32_e32 v5, s6 -; GFX908-NEXT: v_mov_b32_e32 v1, v4 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v5, v0 +; GFX908-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX908-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB3_1 @@ -927,19 +926,19 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 +; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v4 -; GFX8-NEXT: v_min_f32_e32 v3, v0, v2 -; GFX8-NEXT: v_mov_b32_e32 v0, v3 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, v4 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v5, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; GFX8-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB3_1 @@ -956,19 +955,19 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote ; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v1 +; GFX7-NEXT: v_mov_b32_e32 v3, s6 ; GFX7-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v4, v0 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v4 -; GFX7-NEXT: v_min_f32_e32 v3, v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v0, v3 -; GFX7-NEXT: v_mov_b32_e32 v5, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v5, v0 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; GFX7-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v0, v4 +; GFX7-NEXT: v_mov_b32_e32 v1, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB3_1 @@ -985,20 +984,20 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v1 +; GFX6-NEXT: v_mov_b32_e32 v3, s6 ; GFX6-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v4, v0 +; GFX6-NEXT: v_mov_b32_e32 v5, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v4 -; GFX6-NEXT: v_min_f32_e32 v3, v0, v2 -; GFX6-NEXT: v_mov_b32_e32 v0, v3 -; GFX6-NEXT: v_mov_b32_e32 v5, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; GFX6-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX6-NEXT: v_mov_b32_e32 v0, v4 +; GFX6-NEXT: v_mov_b32_e32 v1, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB3_1 @@ -1029,19 +1028,19 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 ; GFX940-NEXT: v_mov_b32_e32 v0, s16 ; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 ; GFX940-NEXT: s_add_i32 s6, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, s6 ; GFX940-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v0 -; GFX940-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, s6 +; GFX940-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5] ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 @@ -1080,19 +1079,19 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 +; GFX90A-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -1108,25 +1107,25 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v1, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX908-NEXT: v_max_f32_e32 v0, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v0, v1 -; GFX908-NEXT: v_mov_b32_e32 v0, v3 -; GFX908-NEXT: v_mov_b32_e32 v5, s6 -; GFX908-NEXT: v_mov_b32_e32 v1, v4 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v5, v0 +; GFX908-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX908-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB4_1 @@ -1143,19 +1142,19 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 +; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v4 -; GFX8-NEXT: v_min_f32_e32 v3, v0, v2 -; GFX8-NEXT: v_mov_b32_e32 v0, v3 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, v4 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v5, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; GFX8-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB4_1 @@ -1198,29 +1197,30 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX12-NEXT: v_mov_b32_e32 v0, s16 -; GFX12-NEXT: s_add_co_i32 s5, s16, 0x800 -; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v6, s4 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 +; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9] -; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[2:3], v[0:1] -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v10, s5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 -; GFX12-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] +; GFX12-NEXT: v_min_num_f64_e32 v[7:8], v[0:1], v[4:5] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 +; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -1243,30 +1243,30 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, s16 -; GFX11-NEXT: s_add_i32 s5, s16, 0x800 -; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_add_i32 s4, s16, 0x800 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_mov_b32_e32 v6, s4 +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 -; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 -; GFX11-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX11-NEXT: v_min_f64 v[6:7], v[2:3], v[0:1] -; GFX11-NEXT: v_mov_b32_e32 v10, s5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 -; GFX11-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], 0 offen glc +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX11-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 +; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -1298,29 +1298,29 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_mov_b32_e32 v2, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 -; GFX908-NEXT: v_mov_b32_e32 v5, v1 +; GFX908-NEXT: v_mov_b32_e32 v3, v1 ; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_add_i32 s6, s20, 0x800 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_mov_b32_e32 v6, s6 ; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v9, v1 -; GFX908-NEXT: v_mov_b32_e32 v8, v0 -; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX908-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX908-NEXT: v_mov_b32_e32 v10, s6 -; GFX908-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] -; GFX908-NEXT: v_mov_b32_e32 v0, v6 -; GFX908-NEXT: v_mov_b32_e32 v1, v7 -; GFX908-NEXT: v_mov_b32_e32 v2, v8 -; GFX908-NEXT: v_mov_b32_e32 v3, v9 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v10, v1 +; GFX908-NEXT: v_mov_b32_e32 v9, v0 +; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX908-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v7 +; GFX908-NEXT: v_mov_b32_e32 v1, v8 +; GFX908-NEXT: v_mov_b32_e32 v2, v9 +; GFX908-NEXT: v_mov_b32_e32 v3, v10 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB5_1 @@ -1331,29 +1331,29 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s20 -; GFX8-NEXT: v_mov_b32_e32 v5, v1 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_add_i32 s6, s20, 0x800 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 ; GFX8-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v1 -; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX8-NEXT: v_mov_b32_e32 v10, s6 -; GFX8-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v0, v6 -; GFX8-NEXT: v_mov_b32_e32 v1, v7 -; GFX8-NEXT: v_mov_b32_e32 v2, v8 -; GFX8-NEXT: v_mov_b32_e32 v3, v9 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v10, v1 +; GFX8-NEXT: v_mov_b32_e32 v9, v0 +; GFX8-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX8-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v7 +; GFX8-NEXT: v_mov_b32_e32 v1, v8 +; GFX8-NEXT: v_mov_b32_e32 v2, v9 +; GFX8-NEXT: v_mov_b32_e32 v3, v10 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB5_1 @@ -1393,27 +1393,27 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, s16 -; GFX12-NEXT: s_add_co_i32 s5, s16, 0x800 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] +; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v6, s4 +; GFX12-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], null offen offset:2048 ; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], null offen offset:2048 ; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[0:1], v[0:1] ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[4:5], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[6:7], v[2:3] -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_dual_mov_b32 v10, s5 :: v_dual_mov_b32 v9, v5 -; GFX12-NEXT: v_dual_mov_b32 v8, v4 :: v_dual_mov_b32 v7, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX12-NEXT: v_mov_b32_e32 v6, v2 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[6:9], v10, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 +; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v4, v6 :: v_dual_mov_b32 v5, v7 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] +; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -1437,28 +1437,27 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v2, s16 -; GFX11-NEXT: s_add_i32 s5, s16, 0x800 +; GFX11-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX11-NEXT: s_add_i32 s4, s16, 0x800 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v6, s4 +; GFX11-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], 0 offen offset:2048 ; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], 0 offen offset:2048 -; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[6:7], v[4:5], v[4:5] +; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_min_f64 v[2:3], v[6:7], v[2:3] -; GFX11-NEXT: v_dual_mov_b32 v10, s5 :: v_dual_mov_b32 v9, v5 -; GFX11-NEXT: v_dual_mov_b32 v8, v4 :: v_dual_mov_b32 v7, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_mov_b32_e32 v6, v2 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[6:9], v10, s[0:3], 0 offen glc +; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 +; GFX11-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v4, v6 :: v_dual_mov_b32 v5, v7 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] +; GFX11-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -1491,27 +1490,27 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_ ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v2, s20 -; GFX908-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 +; GFX908-NEXT: buffer_load_dwordx2 v[2:3], v2, s[16:19], 0 offen offset:2048 +; GFX908-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] ; GFX908-NEXT: s_add_i32 s6, s20, 0x800 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_mov_b32_e32 v6, s6 ; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[6:7], v[4:5], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v10, s6 -; GFX908-NEXT: v_min_f64 v[2:3], v[6:7], v[2:3] -; GFX908-NEXT: v_mov_b32_e32 v9, v5 -; GFX908-NEXT: v_mov_b32_e32 v8, v4 -; GFX908-NEXT: v_mov_b32_e32 v7, v3 -; GFX908-NEXT: v_mov_b32_e32 v6, v2 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[6:9], v10, s[16:19], 0 offen glc +; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v10, v3 +; GFX908-NEXT: v_mov_b32_e32 v9, v2 +; GFX908-NEXT: v_mov_b32_e32 v8, v1 +; GFX908-NEXT: v_mov_b32_e32 v7, v0 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v6 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] +; GFX908-NEXT: v_mov_b32_e32 v2, v7 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v5, v7 +; GFX908-NEXT: v_mov_b32_e32 v3, v8 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB6_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1522,27 +1521,27 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, s20 -; GFX8-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 +; GFX8-NEXT: buffer_load_dwordx2 v[2:3], v2, s[16:19], 0 offen offset:2048 +; GFX8-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] ; GFX8-NEXT: s_add_i32 s6, s20, 0x800 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 ; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[6:7], v[4:5], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v10, s6 -; GFX8-NEXT: v_min_f64 v[2:3], v[6:7], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v9, v5 -; GFX8-NEXT: v_mov_b32_e32 v8, v4 -; GFX8-NEXT: v_mov_b32_e32 v7, v3 -; GFX8-NEXT: v_mov_b32_e32 v6, v2 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[6:9], v10, s[16:19], 0 offen glc +; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v10, v3 +; GFX8-NEXT: v_mov_b32_e32 v9, v2 +; GFX8-NEXT: v_mov_b32_e32 v8, v1 +; GFX8-NEXT: v_mov_b32_e32 v7, v0 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v6 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v2, v7 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v5, v7 +; GFX8-NEXT: v_mov_b32_e32 v3, v8 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB6_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1606,17 +1605,17 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: s_cbranch_execnz .LBB7_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s1 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[5:6], v[5:6] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB7_3: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Loop Header: Depth=1 ; GFX12-NEXT: ; Child Loop BB7_4 Depth 2 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[5:6], v[5:6] ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[13:14], v[13:14] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[13:14], v[13:14] ; GFX12-NEXT: s_mov_b32 s2, exec_lo ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[11:12], v[2:3], v[0:1] +; GFX12-NEXT: v_min_num_f64_e32 v[11:12], v[0:1], v[4:5] ; GFX12-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12 ; GFX12-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 ; GFX12-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1 @@ -1710,17 +1709,17 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg ; GFX11-NEXT: s_cbranch_execnz .LBB7_1 ; GFX11-NEXT: ; %bb.2: ; GFX11-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6] ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB7_3: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-NEXT: ; Child Loop BB7_4 Depth 2 -; GFX11-NEXT: v_max_f64 v[0:1], v[5:6], v[5:6] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[2:3], v[13:14], v[13:14] +; GFX11-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14] ; GFX11-NEXT: s_mov_b32 s2, exec_lo ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[11:12], v[2:3], v[0:1] +; GFX11-NEXT: v_min_f64 v[11:12], v[0:1], v[4:5] ; GFX11-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12 ; GFX11-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 ; GFX11-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1 @@ -1838,15 +1837,15 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg ; GFX908-NEXT: s_cbranch_execnz .LBB7_1 ; GFX908-NEXT: ; %bb.2: ; GFX908-NEXT: s_mov_b64 exec, s[6:7] +; GFX908-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: .LBB7_3: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Loop Header: Depth=1 ; GFX908-NEXT: ; Child Loop BB7_4 Depth 2 -; GFX908-NEXT: v_max_f64 v[0:1], v[5:6], v[5:6] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[2:3], v[13:14], v[13:14] +; GFX908-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14] ; GFX908-NEXT: s_mov_b64 s[12:13], exec -; GFX908-NEXT: v_min_f64 v[11:12], v[2:3], v[0:1] +; GFX908-NEXT: v_min_f64 v[11:12], v[0:1], v[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v11 ; GFX908-NEXT: v_mov_b32_e32 v1, v12 ; GFX908-NEXT: v_mov_b32_e32 v2, v13 @@ -1904,15 +1903,15 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg ; GFX8-NEXT: s_cbranch_execnz .LBB7_1 ; GFX8-NEXT: ; %bb.2: ; GFX8-NEXT: s_mov_b64 exec, s[6:7] +; GFX8-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB7_3: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Loop Header: Depth=1 ; GFX8-NEXT: ; Child Loop BB7_4 Depth 2 -; GFX8-NEXT: v_max_f64 v[0:1], v[5:6], v[5:6] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[2:3], v[13:14], v[13:14] +; GFX8-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14] ; GFX8-NEXT: s_mov_b64 s[12:13], exec -; GFX8-NEXT: v_min_f64 v[11:12], v[2:3], v[0:1] +; GFX8-NEXT: v_min_f64 v[11:12], v[0:1], v[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v11 ; GFX8-NEXT: v_mov_b32_e32 v1, v12 ; GFX8-NEXT: v_mov_b32_e32 v2, v13 @@ -2012,29 +2011,30 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX12-NEXT: v_mov_b32_e32 v0, s16 -; GFX12-NEXT: s_add_co_i32 s5, s16, 0x800 -; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v6, s4 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 +; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9] -; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[2:3], v[0:1] -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v10, s5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 -; GFX12-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] +; GFX12-NEXT: v_min_num_f64_e32 v[7:8], v[0:1], v[4:5] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 +; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -2057,30 +2057,30 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, s16 -; GFX11-NEXT: s_add_i32 s5, s16, 0x800 -; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_add_i32 s4, s16, 0x800 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_mov_b32_e32 v6, s4 +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 -; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 -; GFX11-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX11-NEXT: v_min_f64 v[6:7], v[2:3], v[0:1] -; GFX11-NEXT: v_mov_b32_e32 v10, s5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 -; GFX11-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], 0 offen glc +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX11-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 +; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -2092,31 +2092,31 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 -; GFX10-NEXT: v_mov_b32_e32 v5, v1 -; GFX10-NEXT: s_add_i32 s5, s20, 0x800 -; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: s_add_i32 s4, s20, 0x800 +; GFX10-NEXT: v_mov_b32_e32 v6, s4 ; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 +; GFX10-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v9, v1 -; GFX10-NEXT: v_mov_b32_e32 v8, v0 -; GFX10-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] -; GFX10-NEXT: v_mov_b32_e32 v10, s5 +; GFX10-NEXT: v_mov_b32_e32 v10, v1 +; GFX10-NEXT: v_mov_b32_e32 v9, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX10-NEXT: v_min_f64 v[6:7], v[2:3], v[0:1] -; GFX10-NEXT: v_mov_b32_e32 v0, v6 -; GFX10-NEXT: v_mov_b32_e32 v1, v7 -; GFX10-NEXT: v_mov_b32_e32 v2, v8 -; GFX10-NEXT: v_mov_b32_e32 v3, v9 -; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc +; GFX10-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX10-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] +; GFX10-NEXT: v_mov_b32_e32 v0, v7 +; GFX10-NEXT: v_mov_b32_e32 v1, v8 +; GFX10-NEXT: v_mov_b32_e32 v2, v9 +; GFX10-NEXT: v_mov_b32_e32 v3, v10 +; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB8_1 @@ -2127,26 +2127,26 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x800 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX90A-NEXT: v_mov_b32_e32 v6, s6 ; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX90A-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX90A-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[0,1] -; GFX90A-NEXT: v_mov_b32_e32 v10, s6 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[8:9], v[8:9] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc +; GFX90A-NEXT: v_pk_mov_b32 v[10:11], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_max_f64 v[0:1], v[10:11], v[10:11] +; GFX90A-NEXT: v_min_f64 v[8:9], v[0:1], v[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[8:9] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[10:11], v[10:11] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB8_1 @@ -2157,29 +2157,29 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_mov_b32_e32 v2, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 -; GFX908-NEXT: v_mov_b32_e32 v5, v1 +; GFX908-NEXT: v_mov_b32_e32 v3, v1 ; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_add_i32 s6, s20, 0x800 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_mov_b32_e32 v6, s6 ; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v9, v1 -; GFX908-NEXT: v_mov_b32_e32 v8, v0 -; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX908-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX908-NEXT: v_mov_b32_e32 v10, s6 -; GFX908-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] -; GFX908-NEXT: v_mov_b32_e32 v0, v6 -; GFX908-NEXT: v_mov_b32_e32 v1, v7 -; GFX908-NEXT: v_mov_b32_e32 v2, v8 -; GFX908-NEXT: v_mov_b32_e32 v3, v9 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v10, v1 +; GFX908-NEXT: v_mov_b32_e32 v9, v0 +; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX908-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v7 +; GFX908-NEXT: v_mov_b32_e32 v1, v8 +; GFX908-NEXT: v_mov_b32_e32 v2, v9 +; GFX908-NEXT: v_mov_b32_e32 v3, v10 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB8_1 @@ -2190,29 +2190,29 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s20 -; GFX8-NEXT: v_mov_b32_e32 v5, v1 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_add_i32 s6, s20, 0x800 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 ; GFX8-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v1 -; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX8-NEXT: v_mov_b32_e32 v10, s6 -; GFX8-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v0, v6 -; GFX8-NEXT: v_mov_b32_e32 v1, v7 -; GFX8-NEXT: v_mov_b32_e32 v2, v8 -; GFX8-NEXT: v_mov_b32_e32 v3, v9 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v10, v1 +; GFX8-NEXT: v_mov_b32_e32 v9, v0 +; GFX8-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX8-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v7 +; GFX8-NEXT: v_mov_b32_e32 v1, v8 +; GFX8-NEXT: v_mov_b32_e32 v2, v9 +; GFX8-NEXT: v_mov_b32_e32 v3, v10 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB8_1 @@ -2223,29 +2223,29 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v4, v0 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s20 -; GFX7-NEXT: v_mov_b32_e32 v5, v1 +; GFX7-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX7-NEXT: s_add_i32 s6, s20, 0x800 +; GFX7-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s6 ; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v9, v1 -; GFX7-NEXT: v_mov_b32_e32 v8, v0 -; GFX7-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX7-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX7-NEXT: v_mov_b32_e32 v10, s6 -; GFX7-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v0, v6 -; GFX7-NEXT: v_mov_b32_e32 v1, v7 -; GFX7-NEXT: v_mov_b32_e32 v2, v8 -; GFX7-NEXT: v_mov_b32_e32 v3, v9 -; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v10, v1 +; GFX7-NEXT: v_mov_b32_e32 v9, v0 +; GFX7-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX7-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] +; GFX7-NEXT: v_mov_b32_e32 v0, v7 +; GFX7-NEXT: v_mov_b32_e32 v1, v8 +; GFX7-NEXT: v_mov_b32_e32 v2, v9 +; GFX7-NEXT: v_mov_b32_e32 v3, v10 +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB8_1 @@ -2256,30 +2256,30 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v4, v0 +; GFX6-NEXT: v_mov_b32_e32 v2, v0 ; GFX6-NEXT: v_mov_b32_e32 v0, s20 -; GFX6-NEXT: v_mov_b32_e32 v5, v1 +; GFX6-NEXT: v_mov_b32_e32 v3, v1 ; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX6-NEXT: s_add_i32 s6, s20, 0x800 +; GFX6-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: v_mov_b32_e32 v6, s6 ; GFX6-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v9, v1 -; GFX6-NEXT: v_mov_b32_e32 v8, v0 +; GFX6-NEXT: v_mov_b32_e32 v10, v1 +; GFX6-NEXT: v_mov_b32_e32 v9, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX6-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX6-NEXT: v_mov_b32_e32 v10, s6 -; GFX6-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] -; GFX6-NEXT: v_mov_b32_e32 v0, v6 -; GFX6-NEXT: v_mov_b32_e32 v1, v7 -; GFX6-NEXT: v_mov_b32_e32 v2, v8 -; GFX6-NEXT: v_mov_b32_e32 v3, v9 -; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc +; GFX6-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX6-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] +; GFX6-NEXT: v_mov_b32_e32 v0, v7 +; GFX6-NEXT: v_mov_b32_e32 v1, v8 +; GFX6-NEXT: v_mov_b32_e32 v2, v9 +; GFX6-NEXT: v_mov_b32_e32 v3, v10 +; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB8_1 @@ -2300,29 +2300,30 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX12-NEXT: v_mov_b32_e32 v0, s16 -; GFX12-NEXT: s_add_co_i32 s5, s16, 0x800 -; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v6, s4 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 +; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9] -; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[2:3], v[0:1] -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v10, s5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 -; GFX12-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] +; GFX12-NEXT: v_min_num_f64_e32 v[7:8], v[0:1], v[4:5] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 +; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -2345,30 +2346,30 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, s16 -; GFX11-NEXT: s_add_i32 s5, s16, 0x800 -; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_add_i32 s4, s16, 0x800 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_mov_b32_e32 v6, s4 +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 -; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 -; GFX11-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX11-NEXT: v_min_f64 v[6:7], v[2:3], v[0:1] -; GFX11-NEXT: v_mov_b32_e32 v10, s5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 -; GFX11-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], 0 offen glc +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX11-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 +; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -2400,29 +2401,29 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_mov_b32_e32 v2, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 -; GFX908-NEXT: v_mov_b32_e32 v5, v1 +; GFX908-NEXT: v_mov_b32_e32 v3, v1 ; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_add_i32 s6, s20, 0x800 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_mov_b32_e32 v6, s6 ; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v9, v1 -; GFX908-NEXT: v_mov_b32_e32 v8, v0 -; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX908-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX908-NEXT: v_mov_b32_e32 v10, s6 -; GFX908-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] -; GFX908-NEXT: v_mov_b32_e32 v0, v6 -; GFX908-NEXT: v_mov_b32_e32 v1, v7 -; GFX908-NEXT: v_mov_b32_e32 v2, v8 -; GFX908-NEXT: v_mov_b32_e32 v3, v9 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v10, v1 +; GFX908-NEXT: v_mov_b32_e32 v9, v0 +; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX908-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v7 +; GFX908-NEXT: v_mov_b32_e32 v1, v8 +; GFX908-NEXT: v_mov_b32_e32 v2, v9 +; GFX908-NEXT: v_mov_b32_e32 v3, v10 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB9_1 @@ -2433,29 +2434,29 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s20 -; GFX8-NEXT: v_mov_b32_e32 v5, v1 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_add_i32 s6, s20, 0x800 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 ; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v1 -; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX8-NEXT: v_mov_b32_e32 v10, s6 -; GFX8-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v0, v6 -; GFX8-NEXT: v_mov_b32_e32 v1, v7 -; GFX8-NEXT: v_mov_b32_e32 v2, v8 -; GFX8-NEXT: v_mov_b32_e32 v3, v9 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v10, v1 +; GFX8-NEXT: v_mov_b32_e32 v9, v0 +; GFX8-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX8-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v7 +; GFX8-NEXT: v_mov_b32_e32 v1, v8 +; GFX8-NEXT: v_mov_b32_e32 v2, v9 +; GFX8-NEXT: v_mov_b32_e32 v3, v10 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB9_1 @@ -2499,47 +2500,47 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_addk_co_i32 s16, 0x200 +; GFX12-NEXT: v_max_num_f16_e32 v5, v0, v0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_b32 s4, s16, -4 -; GFX12-NEXT: s_and_b32 s5, s16, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v1, s4 -; GFX12-NEXT: s_lshl_b32 s5, s5, 3 +; GFX12-NEXT: v_mov_b32_e32 v4, s4 +; GFX12-NEXT: s_and_b32 s4, s16, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_lshl_b32 s6, 0xffff, s5 +; GFX12-NEXT: s_lshl_b32 s4, s4, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_not_b32 s7, s6 -; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], null offen -; GFX12-NEXT: s_mov_b32 s6, 0 +; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_not_b32 s6, s5 +; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v1, s5, v2 -; GFX12-NEXT: v_max_num_f16_e32 v3, v0, v0 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v1, v1, v1 -; GFX12-NEXT: v_min_num_f16_e32 v1, v1, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX12-NEXT: v_lshlrev_b32_e32 v1, s5, v1 +; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0 +; GFX12-NEXT: v_min_num_f16_e32 v0, v0, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_and_or_b32 v1, v2, s7, v1 -; GFX12-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_mov_b32 v4, v2 -; GFX12-NEXT: v_mov_b32_e32 v3, v1 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX12-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX12-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_cbranch_execnz .LBB10_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s6 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, s5, v3 +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -2547,263 +2548,256 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_addk_i32 s16, 0x200 -; GFX940-NEXT: s_and_b32 s6, s16, -4 -; GFX940-NEXT: v_mov_b32_e32 v1, s6 -; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen +; GFX940-NEXT: s_and_b32 s4, s16, -4 +; GFX940-NEXT: v_mov_b32_e32 v4, s4 +; GFX940-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen ; GFX940-NEXT: s_and_b32 s4, s16, 3 -; GFX940-NEXT: s_lshl_b32 s7, s4, 3 -; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX940-NEXT: s_not_b32 s8, s4 +; GFX940-NEXT: s_lshl_b32 s6, s4, 3 +; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX940-NEXT: s_not_b32 s7, s4 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 +; GFX940-NEXT: v_max_f16_e32 v5, v0, v0 ; GFX940-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v1, s7, v3 -; GFX940-NEXT: v_max_f16_e32 v2, v0, v0 -; GFX940-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX940-NEXT: v_min_f16_e32 v1, v1, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v1, s7, v1 -; GFX940-NEXT: v_and_or_b32 v2, v3, s8, v1 -; GFX940-NEXT: v_mov_b32_e32 v6, s6 -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX940-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX940-NEXT: v_min_f16_e32 v0, v0, v5 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX940-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX940-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[0:3], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_cbranch_execnz .LBB10_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, s7, v4 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_addk_i32 s16, 0x200 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_max_f16_e32 v5, v0, v0 ; GFX11-NEXT: s_and_b32 s4, s16, -4 -; GFX11-NEXT: s_and_b32 s5, s16, 3 -; GFX11-NEXT: v_mov_b32_e32 v1, s4 -; GFX11-NEXT: s_lshl_b32 s5, s5, 3 -; GFX11-NEXT: s_lshl_b32 s6, 0xffff, s5 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v4, s4 +; GFX11-NEXT: s_and_b32 s4, s16, 3 +; GFX11-NEXT: s_lshl_b32 s4, s4, 3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_not_b32 s7, s6 -; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen -; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX11-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen +; GFX11-NEXT: s_not_b32 s6, s5 +; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, s5, v2 -; GFX11-NEXT: v_max_f16_e32 v3, v0, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX11-NEXT: v_min_f16_e32 v1, v1, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, s5, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_or_b32 v1, v2, s7, v1 -; GFX11-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_mov_b32 v4, v2 -; GFX11-NEXT: v_mov_b32_e32 v3, v1 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc +; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX11-NEXT: v_min_f16_e32 v0, v0, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, v3 -; GFX11-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, v2 +; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_cbranch_execnz .LBB10_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s6 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, s5, v3 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_addk_i32 s20, 0x200 +; GFX10-NEXT: v_max_f16_e32 v5, v0, v0 ; GFX10-NEXT: s_and_b32 s4, s20, -4 -; GFX10-NEXT: s_and_b32 s5, s20, 3 -; GFX10-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-NEXT: s_lshl_b32 s5, s5, 3 -; GFX10-NEXT: s_lshl_b32 s6, 0xffff, s5 -; GFX10-NEXT: s_not_b32 s7, s6 -; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen -; GFX10-NEXT: s_mov_b32 s6, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, s4 +; GFX10-NEXT: s_and_b32 s4, s20, 3 +; GFX10-NEXT: s_lshl_b32 s4, s4, 3 +; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX10-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX10-NEXT: s_not_b32 s6, s5 +; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v1, s5, v2 -; GFX10-NEXT: v_max_f16_e32 v3, v0, v0 -; GFX10-NEXT: v_mov_b32_e32 v5, s4 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX10-NEXT: v_min_f16_e32 v1, v1, v3 -; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v1, v2, s7, v1 -; GFX10-NEXT: v_mov_b32_e32 v4, v2 +; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX10-NEXT: v_min_f16_e32 v0, v0, v5 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v3 -; GFX10-NEXT: s_or_b32 s6, vcc_lo, s6 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v2 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB10_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s6 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, s5, v3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_addk_i32 s20, 0x200 -; GFX90A-NEXT: s_and_b32 s6, s20, -4 -; GFX90A-NEXT: v_mov_b32_e32 v1, s6 -; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen +; GFX90A-NEXT: s_and_b32 s4, s20, -4 +; GFX90A-NEXT: v_mov_b32_e32 v4, s4 +; GFX90A-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen ; GFX90A-NEXT: s_and_b32 s4, s20, 3 -; GFX90A-NEXT: s_lshl_b32 s7, s4, 3 -; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX90A-NEXT: s_not_b32 s8, s4 +; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 +; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX90A-NEXT: s_not_b32 s7, s4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f16_e32 v5, v0, v0 ; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v1, s7, v3 -; GFX90A-NEXT: v_max_f16_e32 v2, v0, v0 -; GFX90A-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX90A-NEXT: v_min_f16_e32 v1, v1, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v1, s7, v1 -; GFX90A-NEXT: v_and_or_b32 v2, v3, s8, v1 -; GFX90A-NEXT: v_mov_b32_e32 v6, s6 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX90A-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX90A-NEXT: v_min_f16_e32 v0, v0, v5 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB10_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s7, v4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_addk_i32 s20, 0x200 -; GFX908-NEXT: s_and_b32 s6, s20, -4 -; GFX908-NEXT: v_mov_b32_e32 v1, s6 -; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen +; GFX908-NEXT: s_and_b32 s4, s20, -4 +; GFX908-NEXT: v_mov_b32_e32 v4, s4 +; GFX908-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen ; GFX908-NEXT: s_and_b32 s4, s20, 3 -; GFX908-NEXT: s_lshl_b32 s7, s4, 3 -; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX908-NEXT: s_not_b32 s8, s4 +; GFX908-NEXT: s_lshl_b32 s6, s4, 3 +; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX908-NEXT: s_not_b32 s7, s4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f16_e32 v5, v0, v0 ; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v1, s7, v2 -; GFX908-NEXT: v_max_f16_e32 v3, v0, v0 -; GFX908-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX908-NEXT: v_min_f16_e32 v1, v1, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v1, s7, v1 -; GFX908-NEXT: v_and_or_b32 v1, v2, s8, v1 -; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_mov_b32_e32 v5, s6 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX908-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX908-NEXT: v_min_f16_e32 v0, v0, v5 +; GFX908-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v2, v3 +; GFX908-NEXT: v_mov_b32_e32 v1, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB10_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, s7, v3 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_addk_i32 s20, 0x200 -; GFX8-NEXT: s_and_b32 s6, s20, -4 -; GFX8-NEXT: v_mov_b32_e32 v1, s6 -; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen +; GFX8-NEXT: s_and_b32 s4, s20, -4 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen ; GFX8-NEXT: s_and_b32 s4, s20, 3 -; GFX8-NEXT: s_lshl_b32 s7, s4, 3 -; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX8-NEXT: s_not_b32 s8, s4 +; GFX8-NEXT: s_lshl_b32 s6, s4, 3 +; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX8-NEXT: s_not_b32 s7, s4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_e32 v5, v0, v0 ; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v1, s7, v2 -; GFX8-NEXT: v_max_f16_e32 v3, v0, v0 -; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX8-NEXT: v_min_f16_e32 v1, v1, v3 -; GFX8-NEXT: v_and_b32_e32 v4, s8, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, s7, v1 -; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX8-NEXT: v_min_f16_e32 v0, v0, v5 +; GFX8-NEXT: v_and_b32_e32 v2, s7, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v2, v3 +; GFX8-NEXT: v_mov_b32_e32 v1, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB10_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s7, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_addk_i32 s20, 0x200 -; GFX7-NEXT: s_and_b32 s6, s20, -4 -; GFX7-NEXT: v_mov_b32_e32 v1, s6 -; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX7-NEXT: s_and_b32 s4, s20, -4 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_and_b32 s4, s20, 3 -; GFX7-NEXT: s_lshl_b32 s7, s4, 3 -; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v0 -; GFX7-NEXT: s_not_b32 s8, s4 +; GFX7-NEXT: s_lshl_b32 s6, s4, 3 +; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0 +; GFX7-NEXT: s_not_b32 s7, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s7, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_and_b32_e32 v2, s8, v1 -; GFX7-NEXT: v_mov_b32_e32 v5, s6 -; GFX7-NEXT: v_min_f32_e32 v0, v0, v4 +; GFX7-NEXT: v_and_b32_e32 v2, s7, v1 +; GFX7-NEXT: v_min_f32_e32 v0, v0, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s7, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX7-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -2813,7 +2807,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX7-NEXT: s_cbranch_execnz .LBB10_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s7, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -2821,31 +2815,30 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_addk_i32 s20, 0x200 -; GFX6-NEXT: s_and_b32 s6, s20, -4 -; GFX6-NEXT: v_mov_b32_e32 v1, s6 -; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX6-NEXT: s_and_b32 s4, s20, -4 +; GFX6-NEXT: v_mov_b32_e32 v4, s4 +; GFX6-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: s_and_b32 s4, s20, 3 -; GFX6-NEXT: s_lshl_b32 s7, s4, 3 -; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v0 -; GFX6-NEXT: s_not_b32 s8, s4 +; GFX6-NEXT: s_lshl_b32 s6, s4, 3 +; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0 +; GFX6-NEXT: s_not_b32 s7, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s7, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v2, s8, v1 -; GFX6-NEXT: v_mov_b32_e32 v5, s6 -; GFX6-NEXT: v_min_f32_e32 v0, v0, v4 +; GFX6-NEXT: v_and_b32_e32 v2, s7, v1 +; GFX6-NEXT: v_min_f32_e32 v0, v0, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s7, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT: v_mov_b32_e32 v3, v1 ; GFX6-NEXT: v_mov_b32_e32 v2, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -2855,7 +2848,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX6-NEXT: s_cbranch_execnz .LBB10_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s7, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -2873,46 +2866,46 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_addk_co_i32 s16, 0x200 +; GFX12-NEXT: v_max_num_f16_e32 v3, v0, v0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_b32 s4, s16, -4 -; GFX12-NEXT: s_and_b32 s5, s16, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v1, s4 -; GFX12-NEXT: s_lshl_b32 s5, s5, 3 +; GFX12-NEXT: v_mov_b32_e32 v2, s4 +; GFX12-NEXT: s_and_b32 s4, s16, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_lshl_b32 s6, 0xffff, s5 +; GFX12-NEXT: s_lshl_b32 s4, s4, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_not_b32 s7, s6 -; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], null offen -; GFX12-NEXT: s_mov_b32 s6, 0 +; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_not_b32 s6, s5 +; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v1, s5, v2 -; GFX12-NEXT: v_max_num_f16_e32 v3, v0, v0 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v1, v1, v1 -; GFX12-NEXT: v_min_num_f16_e32 v1, v1, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX12-NEXT: v_lshlrev_b32_e32 v1, s5, v1 +; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0 +; GFX12-NEXT: v_min_num_f16_e32 v0, v0, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_and_or_b32 v1, v2, s7, v1 -; GFX12-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_mov_b32 v4, v2 -; GFX12-NEXT: v_mov_b32_e32 v3, v1 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX12-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 +; GFX12-NEXT: v_mov_b32_e32 v1, v4 +; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_cbranch_execnz .LBB11_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -2920,32 +2913,31 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_addk_i32 s16, 0x200 -; GFX940-NEXT: s_and_b32 s6, s16, -4 -; GFX940-NEXT: v_mov_b32_e32 v1, s6 -; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen +; GFX940-NEXT: s_and_b32 s4, s16, -4 +; GFX940-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen ; GFX940-NEXT: s_and_b32 s4, s16, 3 -; GFX940-NEXT: s_lshl_b32 s7, s4, 3 -; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX940-NEXT: s_not_b32 s8, s4 +; GFX940-NEXT: s_lshl_b32 s6, s4, 3 +; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX940-NEXT: s_not_b32 s7, s4 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 +; GFX940-NEXT: v_max_f16_e32 v3, v0, v0 ; GFX940-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v1, s7, v3 -; GFX940-NEXT: v_max_f16_e32 v2, v0, v0 -; GFX940-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX940-NEXT: v_min_f16_e32 v1, v1, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v1, s7, v1 -; GFX940-NEXT: v_and_or_b32 v2, v3, s8, v1 -; GFX940-NEXT: v_mov_b32_e32 v6, s6 -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX940-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX940-NEXT: v_min_f16_e32 v0, v0, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX940-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[0:1] ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[0:3], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_cbranch_execnz .LBB11_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2956,114 +2948,111 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_addk_i32 s16, 0x200 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_max_f16_e32 v3, v0, v0 ; GFX11-NEXT: s_and_b32 s4, s16, -4 -; GFX11-NEXT: s_and_b32 s5, s16, 3 -; GFX11-NEXT: v_mov_b32_e32 v1, s4 -; GFX11-NEXT: s_lshl_b32 s5, s5, 3 -; GFX11-NEXT: s_lshl_b32 s6, 0xffff, s5 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-NEXT: s_and_b32 s4, s16, 3 +; GFX11-NEXT: s_lshl_b32 s4, s4, 3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_not_b32 s7, s6 -; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen -; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX11-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen +; GFX11-NEXT: s_not_b32 s6, s5 +; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, s5, v2 -; GFX11-NEXT: v_max_f16_e32 v3, v0, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX11-NEXT: v_min_f16_e32 v1, v1, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, s5, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_or_b32 v1, v2, s7, v1 -; GFX11-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_mov_b32 v4, v2 -; GFX11-NEXT: v_mov_b32_e32 v3, v1 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc +; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX11-NEXT: v_min_f16_e32 v0, v0, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, v3 -; GFX11-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, v4 +; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_cbranch_execnz .LBB11_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_addk_i32 s20, 0x200 +; GFX10-NEXT: v_max_f16_e32 v3, v0, v0 ; GFX10-NEXT: s_and_b32 s4, s20, -4 -; GFX10-NEXT: s_and_b32 s5, s20, 3 -; GFX10-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-NEXT: s_lshl_b32 s5, s5, 3 -; GFX10-NEXT: s_lshl_b32 s6, 0xffff, s5 -; GFX10-NEXT: s_not_b32 s7, s6 -; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen -; GFX10-NEXT: s_mov_b32 s6, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-NEXT: s_and_b32 s4, s20, 3 +; GFX10-NEXT: s_lshl_b32 s4, s4, 3 +; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX10-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX10-NEXT: s_not_b32 s6, s5 +; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v1, s5, v2 -; GFX10-NEXT: v_max_f16_e32 v3, v0, v0 -; GFX10-NEXT: v_mov_b32_e32 v5, s4 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX10-NEXT: v_min_f16_e32 v1, v1, v3 -; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v1, v2, s7, v1 -; GFX10-NEXT: v_mov_b32_e32 v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX10-NEXT: v_min_f16_e32 v0, v0, v3 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX10-NEXT: v_mov_b32_e32 v5, v1 +; GFX10-NEXT: v_mov_b32_e32 v4, v0 +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v3 -; GFX10-NEXT: s_or_b32 s6, vcc_lo, s6 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB11_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_addk_i32 s20, 0x200 -; GFX90A-NEXT: s_and_b32 s6, s20, -4 -; GFX90A-NEXT: v_mov_b32_e32 v1, s6 -; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen +; GFX90A-NEXT: s_and_b32 s4, s20, -4 +; GFX90A-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX90A-NEXT: s_and_b32 s4, s20, 3 -; GFX90A-NEXT: s_lshl_b32 s7, s4, 3 -; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX90A-NEXT: s_not_b32 s8, s4 +; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 +; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX90A-NEXT: s_not_b32 s7, s4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f16_e32 v3, v0, v0 ; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v1, s7, v3 -; GFX90A-NEXT: v_max_f16_e32 v2, v0, v0 -; GFX90A-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX90A-NEXT: v_min_f16_e32 v1, v1, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v1, s7, v1 -; GFX90A-NEXT: v_and_or_b32 v2, v3, s8, v1 -; GFX90A-NEXT: v_mov_b32_e32 v6, s6 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX90A-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX90A-NEXT: v_min_f16_e32 v0, v0, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3074,32 +3063,31 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_addk_i32 s20, 0x200 -; GFX908-NEXT: s_and_b32 s6, s20, -4 -; GFX908-NEXT: v_mov_b32_e32 v1, s6 -; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen +; GFX908-NEXT: s_and_b32 s4, s20, -4 +; GFX908-NEXT: v_mov_b32_e32 v2, s4 +; GFX908-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX908-NEXT: s_and_b32 s4, s20, 3 -; GFX908-NEXT: s_lshl_b32 s7, s4, 3 -; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX908-NEXT: s_not_b32 s8, s4 +; GFX908-NEXT: s_lshl_b32 s6, s4, 3 +; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX908-NEXT: s_not_b32 s7, s4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f16_e32 v3, v0, v0 ; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v1, s7, v2 -; GFX908-NEXT: v_max_f16_e32 v3, v0, v0 -; GFX908-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX908-NEXT: v_min_f16_e32 v1, v1, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v1, s7, v1 -; GFX908-NEXT: v_and_or_b32 v1, v2, s8, v1 -; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_mov_b32_e32 v5, s6 -; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX908-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX908-NEXT: v_min_f16_e32 v0, v0, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX908-NEXT: v_mov_b32_e32 v5, v1 +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v2, v3 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB11_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3110,33 +3098,32 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_addk_i32 s20, 0x200 -; GFX8-NEXT: s_and_b32 s6, s20, -4 -; GFX8-NEXT: v_mov_b32_e32 v1, s6 -; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen +; GFX8-NEXT: s_and_b32 s4, s20, -4 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX8-NEXT: s_and_b32 s4, s20, 3 -; GFX8-NEXT: s_lshl_b32 s7, s4, 3 -; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX8-NEXT: s_not_b32 s8, s4 +; GFX8-NEXT: s_lshl_b32 s6, s4, 3 +; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX8-NEXT: s_not_b32 s7, s4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_e32 v3, v0, v0 ; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v1, s7, v2 -; GFX8-NEXT: v_max_f16_e32 v3, v0, v0 -; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX8-NEXT: v_min_f16_e32 v1, v1, v3 -; GFX8-NEXT: v_and_b32_e32 v4, s8, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, s7, v1 -; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX8-NEXT: v_min_f16_e32 v0, v0, v3 +; GFX8-NEXT: v_and_b32_e32 v4, s7, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX8-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX8-NEXT: v_mov_b32_e32 v5, v1 +; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v2, v3 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB11_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3147,35 +3134,34 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_addk_i32 s20, 0x200 -; GFX7-NEXT: s_and_b32 s6, s20, -4 -; GFX7-NEXT: v_mov_b32_e32 v1, s6 -; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX7-NEXT: s_and_b32 s4, s20, -4 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_and_b32 s4, s20, 3 -; GFX7-NEXT: s_lshl_b32 s7, s4, 3 -; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX7-NEXT: s_not_b32 s8, s4 +; GFX7-NEXT: s_lshl_b32 s6, s4, 3 +; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX7-NEXT: s_not_b32 s7, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s7, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_and_b32_e32 v3, s8, v1 -; GFX7-NEXT: v_mov_b32_e32 v5, s6 -; GFX7-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX7-NEXT: v_and_b32_e32 v4, s7, v1 +; GFX7-NEXT: v_min_f32_e32 v0, v0, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s7, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX7-NEXT: v_mov_b32_e32 v4, v1 -; GFX7-NEXT: v_mov_b32_e32 v3, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX7-NEXT: v_mov_b32_e32 v5, v1 +; GFX7-NEXT: v_mov_b32_e32 v4, v0 +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v1, v3 +; GFX7-NEXT: v_mov_b32_e32 v1, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB11_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3186,36 +3172,35 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_addk_i32 s20, 0x200 -; GFX6-NEXT: s_and_b32 s6, s20, -4 -; GFX6-NEXT: v_mov_b32_e32 v1, s6 -; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX6-NEXT: s_and_b32 s4, s20, -4 +; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: s_and_b32 s4, s20, 3 -; GFX6-NEXT: s_lshl_b32 s7, s4, 3 -; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX6-NEXT: s_not_b32 s8, s4 +; GFX6-NEXT: s_lshl_b32 s6, s4, 3 +; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX6-NEXT: s_not_b32 s7, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s7, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v3, s8, v1 -; GFX6-NEXT: v_mov_b32_e32 v5, s6 -; GFX6-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX6-NEXT: v_and_b32_e32 v4, s7, v1 +; GFX6-NEXT: v_min_f32_e32 v0, v0, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s7, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX6-NEXT: v_mov_b32_e32 v4, v1 -; GFX6-NEXT: v_mov_b32_e32 v3, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX6-NEXT: v_mov_b32_e32 v5, v1 +; GFX6-NEXT: v_mov_b32_e32 v4, v0 +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v1, v3 +; GFX6-NEXT: v_mov_b32_e32 v1, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB11_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3235,15 +3220,15 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_nc_u32_e32 v6, 0x200, v4 +; GFX12-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 ; GFX12-NEXT: s_mov_b32 s1, exec_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_and_b32_e32 v4, 3, v6 -; GFX12-NEXT: v_and_b32_e32 v10, -4, v6 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-NEXT: v_and_b32_e32 v6, 3, v4 +; GFX12-NEXT: v_and_b32_e32 v8, -4, v4 +; GFX12-NEXT: v_lshlrev_b32_e32 v7, 3, v6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff -; GFX12-NEXT: v_not_b32_e32 v11, v7 +; GFX12-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff +; GFX12-NEXT: v_not_b32_e32 v9, v6 ; GFX12-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 @@ -3258,30 +3243,31 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_load_b32 v7, v10, s[4:7], null offen +; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB12_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s1 +; GFX12-NEXT: v_max_num_f16_e32 v10, v5, v5 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Loop Header: Depth=1 ; GFX12-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v6, v4, v7 -; GFX12-NEXT: v_max_num_f16_e32 v8, v5, v5 +; GFX12-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX12-NEXT: s_mov_b32 s2, exec_lo ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v6, v6, v6 -; GFX12-NEXT: v_min_num_f16_e32 v6, v6, v8 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX12-NEXT: v_lshlrev_b32_e32 v6, v4, v6 +; GFX12-NEXT: v_max_num_f16_e32 v4, v4, v4 +; GFX12-NEXT: v_min_num_f16_e32 v4, v4, v10 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v6, v7, v11, v6 -; GFX12-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 +; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, v7, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_or_b32 v5, v6, v9, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v5 +; GFX12-NEXT: v_mov_b32_e32 v5, v6 ; GFX12-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 @@ -3297,15 +3283,15 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB12_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7 -; GFX12-NEXT: v_mov_b32_e32 v7, v8 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 +; GFX12-NEXT: v_mov_b32_e32 v6, v4 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -3313,7 +3299,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: s_cbranch_execnz .LBB12_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v4, v8 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -3321,12 +3307,12 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_add_u32_e32 v4, 0x200, v4 -; GFX940-NEXT: v_and_b32_e32 v10, -4, v4 +; GFX940-NEXT: v_and_b32_e32 v9, -4, v4 ; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX940-NEXT: v_lshlrev_b32_e32 v8, 3, v4 ; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v6, v4, s0 -; GFX940-NEXT: v_not_b32_e32 v11, v6 +; GFX940-NEXT: v_lshlrev_b32_e64 v4, v8, s0 +; GFX940-NEXT: v_not_b32_e32 v10, v4 ; GFX940-NEXT: s_mov_b64 s[2:3], exec ; GFX940-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 @@ -3338,24 +3324,24 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] ; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: buffer_load_dword v7, v10, s[4:7], 0 offen +; GFX940-NEXT: buffer_load_dword v7, v9, s[4:7], 0 offen ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB12_1 ; GFX940-NEXT: ; %bb.2: ; GFX940-NEXT: s_mov_b64 exec, s[2:3] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_max_f16_e32 v11, v5, v5 ; GFX940-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Loop Header: Depth=1 ; GFX940-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v6, v4, v7 -; GFX940-NEXT: v_max_f16_e32 v6, v6, v6 -; GFX940-NEXT: v_max_f16_e32 v8, v5, v5 -; GFX940-NEXT: v_min_f16_e32 v6, v6, v8 -; GFX940-NEXT: v_lshlrev_b32_e32 v6, v4, v6 -; GFX940-NEXT: v_and_or_b32 v6, v7, v11, v6 +; GFX940-NEXT: v_lshrrev_b32_e32 v4, v8, v7 +; GFX940-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX940-NEXT: v_min_f16_e32 v4, v4, v11 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, v8, v4 +; GFX940-NEXT: v_and_or_b32 v6, v7, v10, v4 ; GFX940-NEXT: s_mov_b64 s[8:9], exec -; GFX940-NEXT: v_mov_b64_e32 v[8:9], v[6:7] +; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7] ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX940-NEXT: ; => This Inner Loop Header: Depth=2 @@ -3369,36 +3355,36 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[4:7], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[4:7], 0 offen sc0 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB12_4 ; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX940-NEXT: s_mov_b64 exec, s[8:9] ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v7, v8 +; GFX940-NEXT: v_mov_b32_e32 v7, v4 ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB12_3 ; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v4, v8 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, v8, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x200, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_mov_b32 s2, exec_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_b32_e32 v4, 3, v6 -; GFX11-NEXT: v_and_b32_e32 v10, -4, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-NEXT: v_and_b32_e32 v6, 3, v4 +; GFX11-NEXT: v_and_b32_e32 v8, -4, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 3, v6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff -; GFX11-NEXT: v_not_b32_e32 v11, v7 +; GFX11-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff +; GFX11-NEXT: v_not_b32_e32 v9, v6 ; GFX11-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-NEXT: v_readfirstlane_b32 s5, v1 @@ -3410,29 +3396,30 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-NEXT: buffer_load_b32 v7, v10, s[4:7], 0 offen +; GFX11-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB12_1 ; GFX11-NEXT: ; %bb.2: ; GFX11-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-NEXT: v_max_f16_e32 v10, v5, v5 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v6, v4, v7 -; GFX11-NEXT: v_max_f16_e32 v8, v5, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX11-NEXT: s_mov_b32 s2, exec_lo ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v6, v6, v6 -; GFX11-NEXT: v_min_f16_e32 v6, v6, v8 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v6, v4, v6 +; GFX11-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX11-NEXT: v_min_f16_e32 v4, v4, v10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, v7, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v6, v7, v11, v6 -; GFX11-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 +; GFX11-NEXT: v_and_or_b32 v5, v6, v9, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v5 +; GFX11-NEXT: v_mov_b32_e32 v5, v6 ; GFX11-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 @@ -3446,14 +3433,14 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], 0 offen glc +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB12_4 ; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7 -; GFX11-NEXT: v_mov_b32_e32 v7, v8 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 +; GFX11-NEXT: v_mov_b32_e32 v6, v4 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 @@ -3462,20 +3449,20 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX11-NEXT: s_cbranch_execnz .LBB12_3 ; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v4, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_nc_u32_e32 v6, 0x200, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: s_mov_b32 s6, exec_lo -; GFX10-NEXT: v_and_b32_e32 v4, 3, v6 -; GFX10-NEXT: v_and_b32_e32 v10, -4, v6 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX10-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff -; GFX10-NEXT: v_not_b32_e32 v11, v7 +; GFX10-NEXT: v_and_b32_e32 v6, 3, v4 +; GFX10-NEXT: v_and_b32_e32 v8, -4, v4 +; GFX10-NEXT: v_lshlrev_b32_e32 v7, 3, v6 +; GFX10-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff +; GFX10-NEXT: v_not_b32_e32 v9, v6 ; GFX10-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 ; GFX10-NEXT: v_readfirstlane_b32 s9, v1 @@ -3485,26 +3472,26 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 -; GFX10-NEXT: buffer_load_dword v7, v10, s[8:11], 0 offen +; GFX10-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB12_1 ; GFX10-NEXT: ; %bb.2: ; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: v_max_f16_e32 v10, v5, v5 ; GFX10-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Loop Header: Depth=1 ; GFX10-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v6, v4, v7 -; GFX10-NEXT: v_max_f16_e32 v8, v5, v5 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_f16_e32 v6, v6, v6 -; GFX10-NEXT: v_min_f16_e32 v6, v6, v8 -; GFX10-NEXT: v_lshlrev_b32_sdwa v6, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v6, v7, v11, v6 -; GFX10-NEXT: v_mov_b32_e32 v9, v7 -; GFX10-NEXT: v_mov_b32_e32 v8, v6 +; GFX10-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX10-NEXT: v_min_f16_e32 v4, v4, v10 +; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v5, v6, v9, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v5 +; GFX10-NEXT: v_mov_b32_e32 v5, v6 ; GFX10-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 @@ -3516,15 +3503,15 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB12_4 ; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7 -; GFX10-NEXT: v_mov_b32_e32 v7, v8 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 +; GFX10-NEXT: v_mov_b32_e32 v6, v4 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 @@ -3533,19 +3520,19 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX10-NEXT: s_cbranch_execnz .LBB12_3 ; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, v4, v8 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_u32_e32 v4, 0x200, v4 -; GFX90A-NEXT: v_and_b32_e32 v10, -4, v4 +; GFX90A-NEXT: v_and_b32_e32 v9, -4, v4 ; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v8, 3, v4 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v6, v4, s4 -; GFX90A-NEXT: v_not_b32_e32 v11, v6 +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v8, s4 +; GFX90A-NEXT: v_not_b32_e32 v10, v4 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec ; GFX90A-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -3557,24 +3544,24 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_load_dword v7, v10, s[8:11], 0 offen +; GFX90A-NEXT: buffer_load_dword v7, v9, s[8:11], 0 offen ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB12_1 ; GFX90A-NEXT: ; %bb.2: ; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_max_f16_e32 v11, v5, v5 ; GFX90A-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 ; GFX90A-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v4, v7 -; GFX90A-NEXT: v_max_f16_e32 v6, v6, v6 -; GFX90A-NEXT: v_max_f16_e32 v8, v5, v5 -; GFX90A-NEXT: v_min_f16_e32 v6, v6, v8 -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, v4, v6 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v11, v6 +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v8, v7 +; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX90A-NEXT: v_min_f16_e32 v4, v4, v11 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v8, v4 +; GFX90A-NEXT: v_and_or_b32 v6, v7, v10, v4 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec -; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -3586,33 +3573,33 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB12_4 ; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX90A-NEXT: s_mov_b64 exec, s[12:13] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v7, v8 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB12_3 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v8 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v8, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_u32_e32 v4, 0x200, v4 -; GFX908-NEXT: v_and_b32_e32 v10, -4, v4 +; GFX908-NEXT: v_and_b32_e32 v8, -4, v4 ; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX908-NEXT: v_lshlrev_b32_e32 v7, 3, v4 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v6, v4, s4 -; GFX908-NEXT: v_not_b32_e32 v11, v6 +; GFX908-NEXT: v_lshlrev_b32_e64 v4, v7, s4 +; GFX908-NEXT: v_not_b32_e32 v9, v4 ; GFX908-NEXT: s_mov_b64 s[6:7], exec ; GFX908-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 @@ -3624,25 +3611,25 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: buffer_load_dword v7, v10, s[8:11], 0 offen +; GFX908-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB12_1 ; GFX908-NEXT: ; %bb.2: ; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_max_f16_e32 v10, v5, v5 ; GFX908-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Loop Header: Depth=1 ; GFX908-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v6, v4, v7 -; GFX908-NEXT: v_max_f16_e32 v6, v6, v6 -; GFX908-NEXT: v_max_f16_e32 v8, v5, v5 -; GFX908-NEXT: v_min_f16_e32 v6, v6, v8 -; GFX908-NEXT: v_lshlrev_b32_e32 v6, v4, v6 -; GFX908-NEXT: v_and_or_b32 v6, v7, v11, v6 -; GFX908-NEXT: v_mov_b32_e32 v9, v7 +; GFX908-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX908-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX908-NEXT: v_min_f16_e32 v4, v4, v10 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, v7, v4 +; GFX908-NEXT: v_and_or_b32 v5, v6, v9, v4 +; GFX908-NEXT: v_mov_b32_e32 v4, v5 ; GFX908-NEXT: s_mov_b64 s[12:13], exec -; GFX908-NEXT: v_mov_b32_e32 v8, v6 +; GFX908-NEXT: v_mov_b32_e32 v5, v6 ; GFX908-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 @@ -3654,33 +3641,33 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB12_4 ; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v7, v8 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB12_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, v4, v8 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x200, v4 -; GFX8-NEXT: v_and_b32_e32 v10, -4, v4 +; GFX8-NEXT: v_and_b32_e32 v8, -4, v4 ; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v7, 3, v4 ; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v6, v4, s4 -; GFX8-NEXT: v_not_b32_e32 v11, v6 +; GFX8-NEXT: v_lshlrev_b32_e64 v4, v7, s4 +; GFX8-NEXT: v_not_b32_e32 v9, v4 ; GFX8-NEXT: s_mov_b64 s[6:7], exec ; GFX8-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 @@ -3692,26 +3679,26 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: buffer_load_dword v7, v10, s[8:11], 0 offen +; GFX8-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB12_1 ; GFX8-NEXT: ; %bb.2: ; GFX8-NEXT: s_mov_b64 exec, s[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_max_f16_e32 v10, v5, v5 ; GFX8-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Loop Header: Depth=1 ; GFX8-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v6, v4, v7 -; GFX8-NEXT: v_max_f16_e32 v6, v6, v6 -; GFX8-NEXT: v_max_f16_e32 v8, v5, v5 -; GFX8-NEXT: v_min_f16_e32 v6, v6, v8 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, v4, v6 -; GFX8-NEXT: v_and_b32_e32 v8, v7, v11 -; GFX8-NEXT: v_or_b32_e32 v6, v8, v6 -; GFX8-NEXT: v_mov_b32_e32 v9, v7 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX8-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX8-NEXT: v_min_f16_e32 v4, v4, v10 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, v7, v4 +; GFX8-NEXT: v_and_b32_e32 v5, v6, v9 +; GFX8-NEXT: v_or_b32_e32 v5, v5, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, v5 ; GFX8-NEXT: s_mov_b64 s[12:13], exec -; GFX8-NEXT: v_mov_b32_e32 v8, v6 +; GFX8-NEXT: v_mov_b32_e32 v5, v6 ; GFX8-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 @@ -3723,21 +3710,21 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB12_4 ; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v7, v8 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB12_3 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, v4, v8 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -3898,27 +3885,28 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_addk_co_i32 s16, 0x200 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_b32 s4, s16, -4 -; GFX12-NEXT: s_and_b32 s5, s16, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v1, s4 -; GFX12-NEXT: s_lshl_b32 s5, s5, 3 +; GFX12-NEXT: v_mov_b32_e32 v4, s4 +; GFX12-NEXT: s_and_b32 s4, s16, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_lshl_b32 s6, 0xffff, s5 +; GFX12-NEXT: s_lshl_b32 s4, s4, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_not_b32 s7, s6 -; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen -; GFX12-NEXT: s_mov_b32 s6, 0 +; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_not_b32 s6, s5 +; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, s5, v1 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_lshlrev_b32 v0, 16, v0 -; GFX12-NEXT: v_min_num_f32_e32 v0, v0, v4 +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX12-NEXT: v_min_num_f32_e32 v0, v0, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v3, 0x400000, v0 @@ -3928,23 +3916,23 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v0, s5, v0 +; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v5, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, v2 -; GFX12-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_cbranch_execnz .LBB13_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s6 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, s5, v2 +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -3952,33 +3940,32 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_addk_i32 s16, 0x200 -; GFX940-NEXT: s_and_b32 s6, s16, -4 -; GFX940-NEXT: v_mov_b32_e32 v1, s6 -; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen +; GFX940-NEXT: s_and_b32 s4, s16, -4 +; GFX940-NEXT: v_mov_b32_e32 v4, s4 +; GFX940-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen ; GFX940-NEXT: s_and_b32 s4, s16, 3 -; GFX940-NEXT: s_lshl_b32 s7, s4, 3 -; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX940-NEXT: s_not_b32 s8, s4 +; GFX940-NEXT: s_lshl_b32 s6, s4, 3 +; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX940-NEXT: s_not_b32 s7, s4 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v0 -; GFX940-NEXT: s_movk_i32 s9, 0x7fff +; GFX940-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX940-NEXT: s_movk_i32 s8, 0x7fff ; GFX940-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: v_mov_b32_e32 v5, s6 -; GFX940-NEXT: v_min_f32_e32 v0, v0, v4 +; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: v_min_f32_e32 v0, v0, v5 ; GFX940-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX940-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX940-NEXT: v_add3_u32 v2, v2, v0, s9 +; GFX940-NEXT: v_add3_u32 v2, v2, v0, s8 ; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: s_nop 1 ; GFX940-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v0, v1, s8, v0 +; GFX940-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX940-NEXT: v_mov_b64_e32 v[2:3], v[0:1] -; GFX940-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[0:3], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -3988,32 +3975,33 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX940-NEXT: s_cbranch_execnz .LBB13_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, s7, v2 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_addk_i32 s16, 0x200 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX11-NEXT: s_and_b32 s4, s16, -4 -; GFX11-NEXT: s_and_b32 s5, s16, 3 -; GFX11-NEXT: v_mov_b32_e32 v1, s4 -; GFX11-NEXT: s_lshl_b32 s5, s5, 3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_lshl_b32 s6, 0xffff, s5 -; GFX11-NEXT: s_not_b32 s7, s6 -; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen -; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v4, s4 +; GFX11-NEXT: s_and_b32 s4, s16, 3 +; GFX11-NEXT: s_lshl_b32 s4, s4, 3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX11-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen +; GFX11-NEXT: s_not_b32 s6, s5 +; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, s5, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_lshlrev_b32 v0, 16, v0 -; GFX11-NEXT: v_min_f32_e32 v0, v0, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_min_f32_e32 v0, v0, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v0 @@ -4023,97 +4011,95 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, s5, v0 -; GFX11-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v5, s[0:3], 0 offen glc +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX11-NEXT: v_mov_b32_e32 v1, v2 -; GFX11-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_cbranch_execnz .LBB13_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s6 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, s5, v2 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_addk_i32 s20, 0x200 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX10-NEXT: s_and_b32 s4, s20, -4 -; GFX10-NEXT: s_and_b32 s5, s20, 3 -; GFX10-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-NEXT: s_lshl_b32 s5, s5, 3 -; GFX10-NEXT: s_lshl_b32 s6, 0xffff, s5 -; GFX10-NEXT: s_not_b32 s7, s6 -; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen -; GFX10-NEXT: s_mov_b32 s6, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, s4 +; GFX10-NEXT: s_and_b32 s4, s20, 3 +; GFX10-NEXT: s_lshl_b32 s4, s4, 3 +; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX10-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX10-NEXT: s_not_b32 s6, s5 +; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_mov_b32_e32 v5, s4 +; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_min_f32_e32 v0, v0, v4 +; GFX10-NEXT: v_min_f32_e32 v0, v0, v5 ; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v2 -; GFX10-NEXT: s_or_b32 s6, vcc_lo, s6 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s6 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB13_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s6 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, s5, v2 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_addk_i32 s20, 0x200 -; GFX90A-NEXT: s_and_b32 s6, s20, -4 -; GFX90A-NEXT: v_mov_b32_e32 v1, s6 -; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX90A-NEXT: s_and_b32 s4, s20, -4 +; GFX90A-NEXT: v_mov_b32_e32 v4, s4 +; GFX90A-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen ; GFX90A-NEXT: s_and_b32 s4, s20, 3 -; GFX90A-NEXT: s_lshl_b32 s7, s4, 3 -; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX90A-NEXT: s_not_b32 s8, s4 +; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 +; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX90A-NEXT: s_not_b32 s7, s4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v0 -; GFX90A-NEXT: s_movk_i32 s9, 0x7fff +; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_min_f32_e32 v0, v0, v4 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_min_f32_e32 v0, v0, v5 ; GFX90A-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX90A-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX90A-NEXT: v_add3_u32 v2, v2, v0, s9 +; GFX90A-NEXT: v_add3_u32 v2, v2, v0, s8 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX90A-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v0, v1, s8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, s6 +; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -4123,39 +4109,38 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s7, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_addk_i32 s20, 0x200 -; GFX908-NEXT: s_and_b32 s6, s20, -4 -; GFX908-NEXT: v_mov_b32_e32 v1, s6 -; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX908-NEXT: s_and_b32 s4, s20, -4 +; GFX908-NEXT: v_mov_b32_e32 v4, s4 +; GFX908-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen ; GFX908-NEXT: s_and_b32 s4, s20, 3 -; GFX908-NEXT: s_lshl_b32 s7, s4, 3 -; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX908-NEXT: s_not_b32 s8, s4 +; GFX908-NEXT: s_lshl_b32 s6, s4, 3 +; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX908-NEXT: s_not_b32 s7, s4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v0 -; GFX908-NEXT: s_movk_i32 s9, 0x7fff +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX908-NEXT: v_min_f32_e32 v0, v0, v4 +; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_min_f32_e32 v0, v0, v5 ; GFX908-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX908-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX908-NEXT: v_add3_u32 v2, v2, v0, s9 +; GFX908-NEXT: v_add3_u32 v2, v2, v0, s8 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX908-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v0, v1, s8, v0 +; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: v_mov_b32_e32 v5, s6 ; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -4165,41 +4150,40 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX908-NEXT: s_cbranch_execnz .LBB13_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, s7, v2 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_addk_i32 s20, 0x200 -; GFX8-NEXT: s_and_b32 s6, s20, -4 -; GFX8-NEXT: v_mov_b32_e32 v1, s6 -; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX8-NEXT: s_and_b32 s4, s20, -4 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen ; GFX8-NEXT: s_and_b32 s4, s20, 3 -; GFX8-NEXT: s_lshl_b32 s7, s4, 3 -; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX8-NEXT: s_not_b32 s8, s4 +; GFX8-NEXT: s_lshl_b32 s6, s4, 3 +; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX8-NEXT: s_not_b32 s7, s4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_mov_b32_e32 v0, s7 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX8-NEXT: v_min_f32_e32 v3, v3, v5 ; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 ; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc -; GFX8-NEXT: v_and_b32_e32 v2, s8, v1 +; GFX8-NEXT: v_and_b32_e32 v2, s7, v1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -4209,38 +4193,37 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX8-NEXT: s_cbranch_execnz .LBB13_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s7, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_addk_i32 s20, 0x200 -; GFX7-NEXT: s_and_b32 s6, s20, -4 -; GFX7-NEXT: v_mov_b32_e32 v1, s6 -; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX7-NEXT: s_and_b32 s4, s20, -4 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen ; GFX7-NEXT: s_and_b32 s4, s20, 3 -; GFX7-NEXT: s_lshl_b32 s7, s4, 3 -; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX7-NEXT: s_lshl_b32 s6, s4, 3 +; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: s_not_b32 s8, s4 +; GFX7-NEXT: s_not_b32 s7, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 ; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s7, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_min_f32_e32 v0, v0, v4 +; GFX7-NEXT: v_min_f32_e32 v0, v0, v5 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_and_b32_e32 v2, s8, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s7, v0 +; GFX7-NEXT: v_and_b32_e32 v2, s7, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: v_mov_b32_e32 v5, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -4250,7 +4233,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX7-NEXT: s_cbranch_execnz .LBB13_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s7, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -4258,32 +4241,31 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_addk_i32 s20, 0x200 -; GFX6-NEXT: s_and_b32 s6, s20, -4 -; GFX6-NEXT: v_mov_b32_e32 v1, s6 -; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX6-NEXT: s_and_b32 s4, s20, -4 +; GFX6-NEXT: v_mov_b32_e32 v4, s4 +; GFX6-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen ; GFX6-NEXT: s_and_b32 s4, s20, 3 -; GFX6-NEXT: s_lshl_b32 s7, s4, 3 -; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX6-NEXT: s_lshl_b32 s6, s4, 3 +; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: s_not_b32 s8, s4 +; GFX6-NEXT: s_not_b32 s7, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 ; GFX6-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s7, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: v_min_f32_e32 v0, v0, v4 +; GFX6-NEXT: v_min_f32_e32 v0, v0, v5 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v2, s8, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s7, v0 +; GFX6-NEXT: v_and_b32_e32 v2, s7, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT: v_mov_b32_e32 v3, v1 -; GFX6-NEXT: v_mov_b32_e32 v5, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -4293,7 +4275,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX6-NEXT: s_cbranch_execnz .LBB13_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s7, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -4311,52 +4293,53 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_addk_co_i32 s16, 0x200 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_b32 s4, s16, -4 -; GFX12-NEXT: s_and_b32 s5, s16, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v1, s4 -; GFX12-NEXT: s_lshl_b32 s5, s5, 3 +; GFX12-NEXT: v_mov_b32_e32 v2, s4 +; GFX12-NEXT: s_and_b32 s4, s16, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_lshl_b32 s6, 0xffff, s5 +; GFX12-NEXT: s_lshl_b32 s4, s4, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_not_b32 s7, s6 -; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen -; GFX12-NEXT: s_mov_b32 s6, 0 +; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_not_b32 s6, s5 +; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, s5, v1 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_lshlrev_b32 v0, 16, v0 -; GFX12-NEXT: v_min_num_f32_e32 v0, v0, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX12-NEXT: v_min_num_f32_e32 v0, v0, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX12-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX12-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX12-NEXT: v_add3_u32 v4, v4, v0, 0x7fff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v0, s5, v0 +; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v3, v0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v1 -; GFX12-NEXT: v_mov_b32_e32 v1, v3 -; GFX12-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 +; GFX12-NEXT: v_mov_b32_e32 v1, v4 +; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_cbranch_execnz .LBB14_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -4364,33 +4347,32 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_addk_i32 s16, 0x200 -; GFX940-NEXT: s_and_b32 s6, s16, -4 -; GFX940-NEXT: v_mov_b32_e32 v1, s6 -; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen +; GFX940-NEXT: s_and_b32 s4, s16, -4 +; GFX940-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen ; GFX940-NEXT: s_and_b32 s4, s16, 3 -; GFX940-NEXT: s_lshl_b32 s7, s4, 3 -; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX940-NEXT: s_not_b32 s8, s4 +; GFX940-NEXT: s_lshl_b32 s6, s4, 3 +; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX940-NEXT: s_not_b32 s7, s4 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX940-NEXT: s_movk_i32 s9, 0x7fff +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX940-NEXT: s_movk_i32 s8, 0x7fff ; GFX940-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: v_mov_b32_e32 v3, s6 -; GFX940-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: v_min_f32_e32 v0, v0, v3 ; GFX940-NEXT: v_bfe_u32 v4, v0, 16, 1 ; GFX940-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX940-NEXT: v_add3_u32 v4, v4, v0, s9 +; GFX940-NEXT: v_add3_u32 v4, v4, v0, s8 ; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: s_nop 1 ; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v0, v1, s8, v0 +; GFX940-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[0:1] -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[0:3], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 @@ -4406,123 +4388,122 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_addk_i32 s16, 0x200 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX11-NEXT: s_and_b32 s4, s16, -4 -; GFX11-NEXT: s_and_b32 s5, s16, 3 -; GFX11-NEXT: v_mov_b32_e32 v1, s4 -; GFX11-NEXT: s_lshl_b32 s5, s5, 3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_lshl_b32 s6, 0xffff, s5 -; GFX11-NEXT: s_not_b32 s7, s6 -; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen -; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-NEXT: s_and_b32 s4, s16, 3 +; GFX11-NEXT: s_lshl_b32 s4, s4, 3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX11-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen +; GFX11-NEXT: s_not_b32 s6, s5 +; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, s5, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_lshlrev_b32 v0, 16, v0 -; GFX11-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_min_f32_e32 v0, v0, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX11-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX11-NEXT: v_add3_u32 v4, v4, v0, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, s5, v0 -; GFX11-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v3, v0 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc +; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v1 -; GFX11-NEXT: v_mov_b32_e32 v1, v3 -; GFX11-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, v4 +; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_cbranch_execnz .LBB14_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_addk_i32 s20, 0x200 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX10-NEXT: s_and_b32 s4, s20, -4 -; GFX10-NEXT: s_and_b32 s5, s20, 3 -; GFX10-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-NEXT: s_lshl_b32 s5, s5, 3 -; GFX10-NEXT: s_lshl_b32 s6, 0xffff, s5 -; GFX10-NEXT: s_not_b32 s7, s6 -; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen -; GFX10-NEXT: s_mov_b32 s6, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-NEXT: s_and_b32 s4, s20, 3 +; GFX10-NEXT: s_lshl_b32 s4, s4, 3 +; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX10-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX10-NEXT: s_not_b32 s6, s5 +; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_mov_b32_e32 v5, s4 +; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX10-NEXT: v_min_f32_e32 v0, v0, v3 +; GFX10-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v0, v1, s7, v0 -; GFX10-NEXT: v_mov_b32_e32 v4, v1 -; GFX10-NEXT: v_mov_b32_e32 v3, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX10-NEXT: v_add3_u32 v4, v4, v0, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX10-NEXT: v_mov_b32_e32 v5, v1 +; GFX10-NEXT: v_mov_b32_e32 v4, v0 +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v3 -; GFX10-NEXT: s_or_b32 s6, vcc_lo, s6 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB14_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_addk_i32 s20, 0x200 -; GFX90A-NEXT: s_and_b32 s6, s20, -4 -; GFX90A-NEXT: v_mov_b32_e32 v1, s6 -; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX90A-NEXT: s_and_b32 s4, s20, -4 +; GFX90A-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX90A-NEXT: s_and_b32 s4, s20, 3 -; GFX90A-NEXT: s_lshl_b32 s7, s4, 3 -; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX90A-NEXT: s_not_b32 s8, s4 +; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 +; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX90A-NEXT: s_not_b32 s7, s4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX90A-NEXT: s_movk_i32 s9, 0x7fff +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_min_f32_e32 v0, v0, v3 ; GFX90A-NEXT: v_bfe_u32 v4, v0, 16, 1 ; GFX90A-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX90A-NEXT: v_add3_u32 v4, v4, v0, s9 +; GFX90A-NEXT: v_add3_u32 v4, v4, v0, s8 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX90A-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v0, v1, s8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 +; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 @@ -4538,37 +4519,36 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_addk_i32 s20, 0x200 -; GFX908-NEXT: s_and_b32 s6, s20, -4 -; GFX908-NEXT: v_mov_b32_e32 v1, s6 -; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX908-NEXT: s_and_b32 s4, s20, -4 +; GFX908-NEXT: v_mov_b32_e32 v2, s4 +; GFX908-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX908-NEXT: s_and_b32 s4, s20, 3 -; GFX908-NEXT: s_lshl_b32 s7, s4, 3 -; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX908-NEXT: s_not_b32 s8, s4 +; GFX908-NEXT: s_lshl_b32 s6, s4, 3 +; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX908-NEXT: s_not_b32 s7, s4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX908-NEXT: s_movk_i32 s9, 0x7fff +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX908-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX908-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v0 -; GFX908-NEXT: v_add3_u32 v3, v3, v0, s9 +; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_min_f32_e32 v0, v0, v3 +; GFX908-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX908-NEXT: v_add3_u32 v4, v4, v0, s8 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX908-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v0, v1, s8, v0 -; GFX908-NEXT: v_mov_b32_e32 v4, v1 -; GFX908-NEXT: v_mov_b32_e32 v5, s6 -; GFX908-NEXT: v_mov_b32_e32 v3, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX908-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX908-NEXT: v_mov_b32_e32 v5, v1 +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v1, v3 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB14_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4579,39 +4559,38 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_addk_i32 s20, 0x200 -; GFX8-NEXT: s_and_b32 s6, s20, -4 -; GFX8-NEXT: v_mov_b32_e32 v1, s6 -; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX8-NEXT: s_and_b32 s4, s20, -4 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX8-NEXT: s_and_b32 s4, s20, 3 -; GFX8-NEXT: s_lshl_b32 s7, s4, 3 -; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s7 -; GFX8-NEXT: s_not_b32 s8, s4 +; GFX8-NEXT: s_lshl_b32 s6, s4, 3 +; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX8-NEXT: s_not_b32 s7, s4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_mov_b32_e32 v0, s7 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f32_e32 v4, v4, v2 -; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 +; GFX8-NEXT: v_lshrrev_b32_sdwa v5, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f32_e32 v5, v5, v3 +; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc -; GFX8-NEXT: v_and_b32_e32 v3, s8, v1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX8-NEXT: v_mov_b32_e32 v4, v1 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 -; GFX8-NEXT: v_mov_b32_e32 v3, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc +; GFX8-NEXT: v_and_b32_e32 v4, s7, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX8-NEXT: v_mov_b32_e32 v5, v1 +; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v3 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB14_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4622,36 +4601,35 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_addk_i32 s20, 0x200 -; GFX7-NEXT: s_and_b32 s6, s20, -4 -; GFX7-NEXT: v_mov_b32_e32 v1, s6 -; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX7-NEXT: s_and_b32 s4, s20, -4 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX7-NEXT: s_and_b32 s4, s20, 3 -; GFX7-NEXT: s_lshl_b32 s7, s4, 3 -; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX7-NEXT: s_lshl_b32 s6, s4, 3 +; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: s_not_b32 s8, s4 +; GFX7-NEXT: s_not_b32 s7, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX7-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s7, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX7-NEXT: v_min_f32_e32 v0, v0, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_and_b32_e32 v3, s8, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s7, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX7-NEXT: v_mov_b32_e32 v4, v1 -; GFX7-NEXT: v_mov_b32_e32 v5, s6 -; GFX7-NEXT: v_mov_b32_e32 v3, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX7-NEXT: v_and_b32_e32 v4, s7, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX7-NEXT: v_mov_b32_e32 v5, v1 +; GFX7-NEXT: v_mov_b32_e32 v4, v0 +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v1, v3 +; GFX7-NEXT: v_mov_b32_e32 v1, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB14_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4662,37 +4640,36 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_addk_i32 s20, 0x200 -; GFX6-NEXT: s_and_b32 s6, s20, -4 -; GFX6-NEXT: v_mov_b32_e32 v1, s6 -; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX6-NEXT: s_and_b32 s4, s20, -4 +; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX6-NEXT: s_and_b32 s4, s20, 3 -; GFX6-NEXT: s_lshl_b32 s7, s4, 3 -; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX6-NEXT: s_lshl_b32 s6, s4, 3 +; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: s_not_b32 s8, s4 +; GFX6-NEXT: s_not_b32 s7, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX6-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s7, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX6-NEXT: v_min_f32_e32 v0, v0, v3 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v3, s8, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s7, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX6-NEXT: v_mov_b32_e32 v4, v1 -; GFX6-NEXT: v_mov_b32_e32 v5, s6 -; GFX6-NEXT: v_mov_b32_e32 v3, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX6-NEXT: v_and_b32_e32 v4, s7, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX6-NEXT: v_mov_b32_e32 v5, v1 +; GFX6-NEXT: v_mov_b32_e32 v4, v0 +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v1, v3 +; GFX6-NEXT: v_mov_b32_e32 v1, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB14_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5419,27 +5396,29 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v2, v0 -; GFX12-NEXT: v_mov_b32_e32 v0, s16 -; GFX12-NEXT: s_add_co_i32 s5, s16, 0x400 -; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 +; GFX12-NEXT: s_add_co_i32 s4, s16, 0x400 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v3, s4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_pk_max_num_f16 v2, v1, v1 ; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024 +; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, s5 -; GFX12-NEXT: v_pk_max_num_f16 v0, v2, v2 +; GFX12-NEXT: v_mov_b32_e32 v5, v0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v1, v4, v4 -; GFX12-NEXT: v_pk_min_num_f16 v3, v1, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v0, v5, v5 +; GFX12-NEXT: v_pk_min_num_f16 v4, v0, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -5452,20 +5431,21 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 ; GFX940-NEXT: v_mov_b32_e32 v0, s16 ; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 ; GFX940-NEXT: s_add_i32 s6, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 +; GFX940-NEXT: v_pk_max_f16 v2, v1, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, s6 ; GFX940-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v0 -; GFX940-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX940-NEXT: v_pk_max_f16 v0, v5, v5 -; GFX940-NEXT: v_mov_b32_e32 v3, s6 -; GFX940-NEXT: v_pk_min_f16 v4, v0, v1 ; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: v_pk_min_f16 v4, v0, v2 +; GFX940-NEXT: s_nop 0 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5] ; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -5481,27 +5461,28 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, s16 -; GFX11-NEXT: s_add_i32 s5, s16, 0x400 -; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 +; GFX11-NEXT: s_add_i32 s4, s16, 0x400 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_mov_b32_e32 v3, s4 +; GFX11-NEXT: v_pk_max_f16 v2, v1, v1 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 +; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, s5 -; GFX11-NEXT: v_pk_max_f16 v0, v2, v2 +; GFX11-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v1, v4, v4 -; GFX11-NEXT: v_pk_min_f16 v3, v1, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v0, v5, v5 +; GFX11-NEXT: v_pk_min_f16 v4, v0, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], 0 offen glc +; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -5513,27 +5494,27 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 -; GFX10-NEXT: s_add_i32 s5, s20, 0x400 -; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 +; GFX10-NEXT: v_mov_b32_e32 v3, s4 +; GFX10-NEXT: v_pk_max_f16 v2, v1, v1 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: v_pk_max_f16 v0, v2, v2 -; GFX10-NEXT: v_mov_b32_e32 v5, s5 +; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_max_f16 v1, v4, v4 -; GFX10-NEXT: v_pk_min_f16 v3, v1, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 -; GFX10-NEXT: v_mov_b32_e32 v1, v4 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX10-NEXT: v_pk_max_f16 v0, v5, v5 +; GFX10-NEXT: v_pk_min_f16 v4, v0, v2 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB16_1 @@ -5544,19 +5525,19 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v2, v1, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX90A-NEXT: v_pk_max_f16 v0, v5, v5 -; GFX90A-NEXT: v_pk_min_f16 v4, v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 +; GFX90A-NEXT: v_pk_min_f16 v4, v0, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -5572,25 +5553,25 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v1, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_pk_max_f16 v2, v1, v1 +; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: v_pk_max_f16 v1, v2, v2 -; GFX908-NEXT: v_pk_max_f16 v0, v4, v4 -; GFX908-NEXT: v_pk_min_f16 v3, v0, v1 -; GFX908-NEXT: v_mov_b32_e32 v0, v3 -; GFX908-NEXT: v_mov_b32_e32 v5, s6 -; GFX908-NEXT: v_mov_b32_e32 v1, v4 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v5, v0 +; GFX908-NEXT: v_pk_max_f16 v0, v5, v5 +; GFX908-NEXT: v_pk_min_f16 v4, v0, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB16_1 @@ -5601,29 +5582,29 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s20 ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_sdwa v2, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v3, v1, v1 +; GFX8-NEXT: v_mov_b32_e32 v4, s6 ; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v0, v2, v2 -; GFX8-NEXT: v_max_f16_sdwa v3, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v4, v4 -; GFX8-NEXT: v_min_f16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v0, v6, v0 -; GFX8-NEXT: v_or_b32_e32 v3, v0, v1 -; GFX8-NEXT: v_mov_b32_e32 v0, v3 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, v4 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v1, v6, v6 +; GFX8-NEXT: v_min_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v1, v1, v3 +; GFX8-NEXT: v_or_b32_e32 v5, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, v5 +; GFX8-NEXT: v_mov_b32_e32 v1, v6 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB16_1 @@ -5646,30 +5627,30 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_mov_b32_e32 v8, s6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_min_f32_e32 v4, v4, v2 -; GFX7-NEXT: v_min_f32_e32 v5, v5, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v5 -; GFX7-NEXT: v_or_b32_e32 v5, v0, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX7-NEXT: v_or_b32_e32 v4, v6, v0 +; GFX7-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX7-NEXT: v_min_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v0 +; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[16:19], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v7 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB16_1 @@ -5692,31 +5673,31 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, s6 ; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_mov_b32_e32 v8, s6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_min_f32_e32 v4, v4, v2 -; GFX6-NEXT: v_min_f32_e32 v5, v5, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX6-NEXT: v_min_f32_e32 v6, v6, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v5 -; GFX6-NEXT: v_or_b32_e32 v5, v0, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX6-NEXT: v_or_b32_e32 v4, v6, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v5, v7, v0 +; GFX6-NEXT: v_mov_b32_e32 v8, v6 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[16:19], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v7 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB16_1 @@ -5738,25 +5719,26 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, s16 -; GFX12-NEXT: s_add_co_i32 s5, s16, 0x400 +; GFX12-NEXT: s_add_co_i32 s4, s16, 0x400 +; GFX12-NEXT: v_pk_max_num_f16 v2, v0, v0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v3, s4 ; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], null offen offset:1024 +; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024 ; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_pk_max_num_f16 v1, v0, v0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 +; GFX12-NEXT: v_pk_max_num_f16 v0, v1, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_pk_min_num_f16 v1, v3, v1 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_dual_mov_b32 v5, s5 :: v_dual_mov_b32 v4, v2 -; GFX12-NEXT: v_mov_b32_e32 v3, v1 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_min_num_f16 v0, v0, v2 +; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX12-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 +; GFX12-NEXT: v_mov_b32_e32 v1, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -5770,24 +5752,25 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v1, s16 -; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:1024 +; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 ; GFX940-NEXT: s_add_i32 s6, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 +; GFX940-NEXT: v_pk_max_f16 v2, v0, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, s6 ; GFX940-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_pk_max_f16 v1, v0, v0 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX940-NEXT: v_mov_b32_e32 v6, s6 -; GFX940-NEXT: v_pk_min_f16 v2, v2, v1 +; GFX940-NEXT: v_pk_max_f16 v0, v1, v1 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3] -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[0:3], 0 offen sc0 +; GFX940-NEXT: v_pk_min_f16 v0, v0, v2 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[0:1] +; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_cbranch_execnz .LBB17_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5798,25 +5781,25 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v1, s16 -; GFX11-NEXT: s_add_i32 s5, s16, 0x400 +; GFX11-NEXT: s_add_i32 s4, s16, 0x400 +; GFX11-NEXT: v_pk_max_f16 v2, v0, v0 +; GFX11-NEXT: v_mov_b32_e32 v3, s4 ; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024 +; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_pk_max_f16 v1, v0, v0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX11-NEXT: v_pk_max_f16 v0, v1, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_pk_min_f16 v1, v3, v1 -; GFX11-NEXT: v_dual_mov_b32 v5, s5 :: v_dual_mov_b32 v4, v2 -; GFX11-NEXT: v_mov_b32_e32 v3, v1 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_min_f16 v0, v0, v2 +; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, v4 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -5829,25 +5812,25 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s20 -; GFX10-NEXT: s_add_i32 s5, s20, 0x400 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 +; GFX10-NEXT: v_pk_max_f16 v2, v0, v0 +; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_pk_max_f16 v1, v0, v0 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX10-NEXT: v_mov_b32_e32 v5, s5 +; GFX10-NEXT: v_pk_max_f16 v0, v1, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_min_f16 v1, v3, v1 -; GFX10-NEXT: v_mov_b32_e32 v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX10-NEXT: v_pk_min_f16 v0, v0, v2 +; GFX10-NEXT: v_mov_b32_e32 v5, v1 +; GFX10-NEXT: v_mov_b32_e32 v4, v0 +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB17_1 @@ -5859,23 +5842,23 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, s20 -; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024 +; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v2, v0, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_pk_max_f16 v1, v0, v0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX90A-NEXT: v_pk_min_f16 v2, v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v6, s6 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc +; GFX90A-NEXT: v_pk_max_f16 v0, v1, v1 +; GFX90A-NEXT: v_pk_min_f16 v0, v0, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5886,24 +5869,24 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, s20 -; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_pk_max_f16 v2, v0, v0 +; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_pk_max_f16 v1, v0, v0 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX908-NEXT: v_pk_min_f16 v1, v3, v1 -; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_mov_b32_e32 v5, s6 -; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX908-NEXT: v_pk_max_f16 v0, v1, v1 +; GFX908-NEXT: v_pk_min_f16 v0, v0, v2 +; GFX908-NEXT: v_mov_b32_e32 v5, v1 +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v2, v3 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB17_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5914,28 +5897,28 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s20 -; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_sdwa v2, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v3, v0, v0 +; GFX8-NEXT: v_mov_b32_e32 v4, s6 ; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_max_f16_sdwa v1, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v4, v0, v0 -; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX8-NEXT: v_min_f16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v3, v5, v4 -; GFX8-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_mov_b32_e32 v6, s6 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v6, s[16:19], 0 offen glc +; GFX8-NEXT: v_max_f16_sdwa v0, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v1, v1 +; GFX8-NEXT: v_min_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v5, v5, v3 +; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 +; GFX8-NEXT: v_mov_b32_e32 v6, v1 +; GFX8-NEXT: v_mov_b32_e32 v5, v0 +; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v2, v3 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB17_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5946,41 +5929,41 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0 ; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX7-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_mov_b32_e32 v7, s6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_min_f32_e32 v4, v4, v0 -; GFX7-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_or_b32_e32 v4, v2, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v3, v5, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_min_f32_e32 v5, v5, v0 +; GFX7-NEXT: v_min_f32_e32 v6, v6, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_or_b32_e32 v5, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX7-NEXT: v_or_b32_e32 v4, v6, v3 +; GFX7-NEXT: v_mov_b32_e32 v7, v5 ; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v7, s[16:19], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB17_1 @@ -5992,42 +5975,42 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 ; GFX6-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_mov_b32_e32 v7, s6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_min_f32_e32 v4, v4, v0 -; GFX6-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_or_b32_e32 v4, v2, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX6-NEXT: v_or_b32_e32 v3, v5, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_min_f32_e32 v5, v5, v0 +; GFX6-NEXT: v_min_f32_e32 v6, v6, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_or_b32_e32 v5, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX6-NEXT: v_or_b32_e32 v4, v6, v3 +; GFX6-NEXT: v_mov_b32_e32 v7, v5 ; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v7, s[16:19], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB17_1 @@ -6048,7 +6031,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_nc_u32_e32 v9, 0x400, v4 +; GFX12-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 ; GFX12-NEXT: s_mov_b32 s1, exec_lo ; GFX12-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 @@ -6064,26 +6047,26 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_load_b32 v8, v4, s[4:7], null offen offset:1024 +; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024 ; GFX12-NEXT: ; implicit-def: $vgpr4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB18_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s1 +; GFX12-NEXT: v_pk_max_num_f16 v8, v5, v5 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Loop Header: Depth=1 ; GFX12-NEXT: ; Child Loop BB18_4 Depth 2 -; GFX12-NEXT: v_pk_max_num_f16 v4, v5, v5 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v6, v8, v8 +; GFX12-NEXT: v_pk_max_num_f16 v4, v6, v6 ; GFX12-NEXT: s_mov_b32 s2, exec_lo ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_min_num_f16 v7, v6, v4 -; GFX12-NEXT: v_mov_b32_e32 v6, v7 -; GFX12-NEXT: v_mov_b32_e32 v7, v8 +; GFX12-NEXT: v_pk_min_num_f16 v5, v4, v8 +; GFX12-NEXT: v_mov_b32_e32 v4, v5 +; GFX12-NEXT: v_mov_b32_e32 v5, v6 ; GFX12-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 @@ -6099,15 +6082,15 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB18_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 -; GFX12-NEXT: v_mov_b32_e32 v8, v6 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 +; GFX12-NEXT: v_mov_b32_e32 v6, v4 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -6115,14 +6098,14 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_cbranch_execnz .LBB18_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: v_mov_b32_e32 v0, v6 +; GFX12-NEXT: v_mov_b32_e32 v0, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_u32_e32 v10, 0x400, v4 +; GFX940-NEXT: v_add_u32_e32 v8, 0x400, v4 ; GFX940-NEXT: s_mov_b64 s[2:3], exec ; GFX940-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 @@ -6134,23 +6117,23 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] ; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: buffer_load_dword v9, v4, s[4:7], 0 offen offset:1024 +; GFX940-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024 ; GFX940-NEXT: ; implicit-def: $vgpr4 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB18_1 ; GFX940-NEXT: ; %bb.2: ; GFX940-NEXT: s_mov_b64 exec, s[2:3] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_pk_max_f16 v9, v5, v5 ; GFX940-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Loop Header: Depth=1 ; GFX940-NEXT: ; Child Loop BB18_4 Depth 2 -; GFX940-NEXT: v_pk_max_f16 v4, v5, v5 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v6, v9, v9 +; GFX940-NEXT: v_pk_max_f16 v4, v7, v7 ; GFX940-NEXT: s_mov_b64 s[8:9], exec -; GFX940-NEXT: v_pk_min_f16 v8, v6, v4 +; GFX940-NEXT: v_pk_min_f16 v6, v4, v9 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[8:9] +; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7] ; GFX940-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX940-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 @@ -6163,27 +6146,27 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_atomic_cmpswap v[6:7], v10, s[4:7], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB18_4 ; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX940-NEXT: s_mov_b64 exec, s[8:9] ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v9, v6 +; GFX940-NEXT: v_mov_b32_e32 v7, v4 ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB18_3 ; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x400, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_mov_b32 s2, exec_lo ; GFX11-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 @@ -6197,25 +6180,25 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-NEXT: buffer_load_b32 v8, v4, s[4:7], 0 offen offset:1024 +; GFX11-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024 ; GFX11-NEXT: ; implicit-def: $vgpr4 ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB18_1 ; GFX11-NEXT: ; %bb.2: ; GFX11-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-NEXT: v_pk_max_f16 v8, v5, v5 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-NEXT: ; Child Loop BB18_4 Depth 2 -; GFX11-NEXT: v_pk_max_f16 v4, v5, v5 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v6, v8, v8 +; GFX11-NEXT: v_pk_max_f16 v4, v6, v6 ; GFX11-NEXT: s_mov_b32 s2, exec_lo ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_min_f16 v7, v6, v4 -; GFX11-NEXT: v_mov_b32_e32 v6, v7 -; GFX11-NEXT: v_mov_b32_e32 v7, v8 +; GFX11-NEXT: v_pk_min_f16 v5, v4, v8 +; GFX11-NEXT: v_mov_b32_e32 v4, v5 +; GFX11-NEXT: v_mov_b32_e32 v5, v6 ; GFX11-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 @@ -6229,14 +6212,14 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], 0 offen glc +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB18_4 ; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 -; GFX11-NEXT: v_mov_b32_e32 v8, v6 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 +; GFX11-NEXT: v_mov_b32_e32 v6, v4 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 @@ -6245,13 +6228,13 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX11-NEXT: s_cbranch_execnz .LBB18_3 ; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, v6 +; GFX11-NEXT: v_mov_b32_e32 v0, v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_nc_u32_e32 v9, 0x400, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 @@ -6263,24 +6246,24 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 -; GFX10-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 ; GFX10-NEXT: ; implicit-def: $vgpr4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB18_1 ; GFX10-NEXT: ; %bb.2: ; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: v_pk_max_f16 v8, v5, v5 ; GFX10-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Loop Header: Depth=1 ; GFX10-NEXT: ; Child Loop BB18_4 Depth 2 -; GFX10-NEXT: v_pk_max_f16 v4, v5, v5 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v6, v8, v8 +; GFX10-NEXT: v_pk_max_f16 v4, v6, v6 ; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_min_f16 v7, v6, v4 -; GFX10-NEXT: v_mov_b32_e32 v6, v7 -; GFX10-NEXT: v_mov_b32_e32 v7, v8 +; GFX10-NEXT: v_pk_min_f16 v5, v4, v8 +; GFX10-NEXT: v_mov_b32_e32 v4, v5 +; GFX10-NEXT: v_mov_b32_e32 v5, v6 ; GFX10-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 @@ -6292,15 +6275,15 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB18_4 ; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 -; GFX10-NEXT: v_mov_b32_e32 v8, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 +; GFX10-NEXT: v_mov_b32_e32 v6, v4 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 @@ -6309,13 +6292,13 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX10-NEXT: s_cbranch_execnz .LBB18_3 ; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: v_mov_b32_e32 v0, v6 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_u32_e32 v10, 0x400, v4 +; GFX90A-NEXT: v_add_u32_e32 v8, 0x400, v4 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec ; GFX90A-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -6327,22 +6310,22 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_load_dword v9, v4, s[8:11], 0 offen offset:1024 +; GFX90A-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 ; GFX90A-NEXT: ; implicit-def: $vgpr4 ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB18_1 ; GFX90A-NEXT: ; %bb.2: ; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_pk_max_f16 v9, v5, v5 ; GFX90A-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 ; GFX90A-NEXT: ; Child Loop BB18_4 Depth 2 -; GFX90A-NEXT: v_pk_max_f16 v4, v5, v5 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v6, v9, v9 -; GFX90A-NEXT: v_pk_min_f16 v8, v6, v4 +; GFX90A-NEXT: v_pk_max_f16 v4, v7, v7 +; GFX90A-NEXT: v_pk_min_f16 v6, v4, v9 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[8:9], v[8:9] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -6354,27 +6337,27 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v10, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB18_4 ; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX90A-NEXT: s_mov_b64 exec, s[12:13] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v9, v6 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB18_3 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_u32_e32 v9, 0x400, v4 +; GFX908-NEXT: v_add_u32_e32 v7, 0x400, v4 ; GFX908-NEXT: s_mov_b64 s[6:7], exec ; GFX908-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 @@ -6386,23 +6369,23 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 +; GFX908-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 ; GFX908-NEXT: ; implicit-def: $vgpr4 ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB18_1 ; GFX908-NEXT: ; %bb.2: ; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_pk_max_f16 v8, v5, v5 ; GFX908-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Loop Header: Depth=1 ; GFX908-NEXT: ; Child Loop BB18_4 Depth 2 -; GFX908-NEXT: v_pk_max_f16 v4, v5, v5 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v6, v8, v8 -; GFX908-NEXT: v_pk_min_f16 v7, v6, v4 -; GFX908-NEXT: v_mov_b32_e32 v6, v7 +; GFX908-NEXT: v_pk_max_f16 v4, v6, v6 +; GFX908-NEXT: v_pk_min_f16 v5, v4, v8 +; GFX908-NEXT: v_mov_b32_e32 v4, v5 ; GFX908-NEXT: s_mov_b64 s[12:13], exec -; GFX908-NEXT: v_mov_b32_e32 v7, v8 +; GFX908-NEXT: v_mov_b32_e32 v5, v6 ; GFX908-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 @@ -6414,27 +6397,27 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB18_4 ; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v8, v6 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB18_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v0, v6 +; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x400, v4 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x400, v4 ; GFX8-NEXT: s_mov_b64 s[6:7], exec ; GFX8-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 @@ -6446,27 +6429,27 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 +; GFX8-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 ; GFX8-NEXT: ; implicit-def: $vgpr4 ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB18_1 ; GFX8-NEXT: ; %bb.2: ; GFX8-NEXT: s_mov_b64 exec, s[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_max_f16_sdwa v8, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v9, v5, v5 ; GFX8-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Loop Header: Depth=1 ; GFX8-NEXT: ; Child Loop BB18_4 Depth 2 -; GFX8-NEXT: v_max_f16_sdwa v4, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v6, v8, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_min_f16_sdwa v4, v6, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v6, v5, v5 -; GFX8-NEXT: v_max_f16_e32 v7, v8, v8 -; GFX8-NEXT: v_min_f16_e32 v6, v7, v6 -; GFX8-NEXT: v_or_b32_e32 v7, v6, v4 -; GFX8-NEXT: v_mov_b32_e32 v6, v7 +; GFX8-NEXT: v_max_f16_sdwa v4, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 +; GFX8-NEXT: v_min_f16_sdwa v4, v4, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v5, v5, v9 +; GFX8-NEXT: v_or_b32_e32 v5, v5, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, v5 ; GFX8-NEXT: s_mov_b64 s[12:13], exec -; GFX8-NEXT: v_mov_b32_e32 v7, v8 +; GFX8-NEXT: v_mov_b32_e32 v5, v6 ; GFX8-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 @@ -6478,21 +6461,21 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB18_4 ; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v8, v6 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB18_3 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v0, v6 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -6669,43 +6652,45 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v2, v0 -; GFX12-NEXT: v_mov_b32_e32 v0, s16 -; GFX12-NEXT: s_add_co_i32 s6, s16, 0x400 +; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 +; GFX12-NEXT: s_add_co_i32 s4, s16, 0x400 ; GFX12-NEXT: s_mov_b32 s5, 0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v4, s4 +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024 +; GFX12-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX12-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_and_b32 v1, 0xffff0000, v2 +; GFX12-NEXT: v_mov_b32_e32 v6, v0 ; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX12-NEXT: v_min_num_f32_e32 v1, v1, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX12-NEXT: v_min_num_f32_e32 v1, v5, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v5, v1, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX12-NEXT: v_bfe_u32 v7, v1, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v1 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX12-NEXT: v_add3_u32 v5, v5, v1, 0x7fff +; GFX12-NEXT: v_add3_u32 v7, v7, v1, 0x7fff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_min_num_f32 v0, v3, v0 :: v_dual_cndmask_b32 v1, v5, v7 -; GFX12-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; GFX12-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6 +; GFX12-NEXT: v_min_num_f32_e32 v0, v0, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX12-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v5, s6 -; GFX12-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX12-NEXT: v_add3_u32 v5, v5, v0, 0x7fff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e64 v0, v3, v6, s4 -; GFX12-NEXT: v_perm_b32 v3, v1, v0, 0x7060302 +; GFX12-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 +; GFX12-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -6718,41 +6703,41 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 ; GFX940-NEXT: v_mov_b32_e32 v0, s16 ; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s8, s16, 0x400 +; GFX940-NEXT: s_add_i32 s4, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[6:7], 0 -; GFX940-NEXT: s_movk_i32 s9, 0x7fff -; GFX940-NEXT: s_mov_b32 s10, 0x7060302 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX940-NEXT: s_movk_i32 s8, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX940-NEXT: s_mov_b32 s9, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, s4 ; GFX940-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v0 -; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX940-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX940-NEXT: v_min_f32_e32 v1, v4, v1 -; GFX940-NEXT: v_min_f32_e32 v0, v6, v0 -; GFX940-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX940-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX940-NEXT: v_add3_u32 v4, v4, v1, s9 -; GFX940-NEXT: v_add3_u32 v7, v7, v0, s9 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, s8 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v0, v4, v6, s[4:5] -; GFX940-NEXT: v_perm_b32 v4, v1, v0, s10 -; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5] +; GFX940-NEXT: v_mov_b32_e32 v7, v0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 +; GFX940-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX940-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX940-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v1, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX940-NEXT: v_add3_u32 v5, v5, v0, s8 +; GFX940-NEXT: v_add3_u32 v8, v8, v1, s8 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 +; GFX940-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5] +; GFX940-NEXT: v_perm_b32 v6, v1, v0, s9 +; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[6:7] +; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 ; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX940-NEXT: s_cbranch_execnz .LBB19_1 @@ -6763,45 +6748,47 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, s16 -; GFX11-NEXT: s_add_i32 s6, s16, 0x400 +; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 +; GFX11-NEXT: s_add_i32 s4, s16, 0x400 ; GFX11-NEXT: s_mov_b32 s5, 0 +; GFX11-NEXT: v_mov_b32_e32 v4, s4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_and_b32 v1, 0xffff0000, v2 +; GFX11-NEXT: v_mov_b32_e32 v6, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX11-NEXT: v_min_f32_e32 v1, v1, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX11-NEXT: v_min_f32_e32 v1, v5, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v5, v1, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX11-NEXT: v_bfe_u32 v7, v1, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: v_add3_u32 v5, v5, v1, 0x7fff +; GFX11-NEXT: v_add3_u32 v7, v7, v1, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_min_f32 v0, v3, v0 :: v_dual_cndmask_b32 v1, v5, v7 -; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; GFX11-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6 +; GFX11-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX11-NEXT: v_mov_b32_e32 v5, s6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff -; GFX11-NEXT: v_cndmask_b32_e64 v0, v3, v6, s4 +; GFX11-NEXT: v_add3_u32 v5, v5, v0, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v1, v0, 0x7060302 -; GFX11-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], 0 offen glc +; GFX11-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 +; GFX11-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -6814,41 +6801,41 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 -; GFX10-NEXT: s_add_i32 s6, s20, 0x400 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 ; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, s4 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX10-NEXT: v_mov_b32_e32 v6, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX10-NEXT: v_min_f32_e32 v0, v3, v0 -; GFX10-NEXT: v_min_f32_e32 v1, v5, v1 -; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v5, v1, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX10-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX10-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v1, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v1 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v5, v5, v1, 0x7fff +; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v1, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v3, v6, s4 -; GFX10-NEXT: v_mov_b32_e32 v5, s6 -; GFX10-NEXT: v_perm_b32 v3, v1, v0, 0x7060302 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 -; GFX10-NEXT: v_mov_b32_e32 v1, v4 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 +; GFX10-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 +; GFX10-NEXT: v_mov_b32_e32 v0, v5 +; GFX10-NEXT: v_mov_b32_e32 v1, v6 +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB19_1 @@ -6859,40 +6846,40 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s8, s20, 0x400 +; GFX90A-NEXT: s_add_i32 s4, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: s_movk_i32 s9, 0x7fff -; GFX90A-NEXT: s_mov_b32 s10, 0x7060302 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, s4 ; GFX90A-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX90A-NEXT: v_min_f32_e32 v1, v4, v1 -; GFX90A-NEXT: v_min_f32_e32 v0, v6, v0 -; GFX90A-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX90A-NEXT: v_add3_u32 v4, v4, v1, s9 -; GFX90A-NEXT: v_add3_u32 v7, v7, v0, s9 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v4, v6, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc -; GFX90A-NEXT: v_perm_b32 v4, v1, v0, s10 -; GFX90A-NEXT: v_mov_b32_e32 v3, s8 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 +; GFX90A-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX90A-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX90A-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v1, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX90A-NEXT: v_add3_u32 v5, v5, v0, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v1, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v6, v1, v0, s9 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB19_1 @@ -6903,41 +6890,41 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v1, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s8, s20, 0x400 +; GFX908-NEXT: s_add_i32 s4, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: s_movk_i32 s9, 0x7fff -; GFX908-NEXT: s_mov_b32 s10, 0x7060302 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX908-NEXT: s_mov_b32 s9, 0x7060302 +; GFX908-NEXT: v_mov_b32_e32 v4, s4 ; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX908-NEXT: v_min_f32_e32 v1, v3, v1 -; GFX908-NEXT: v_min_f32_e32 v0, v6, v0 -; GFX908-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX908-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX908-NEXT: v_add3_u32 v3, v3, v1, s9 -; GFX908-NEXT: v_add3_u32 v7, v7, v0, s9 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v3, v6, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc -; GFX908-NEXT: v_perm_b32 v3, v1, v0, s10 -; GFX908-NEXT: v_mov_b32_e32 v0, v3 -; GFX908-NEXT: v_mov_b32_e32 v5, s8 -; GFX908-NEXT: v_mov_b32_e32 v1, v4 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v6, v0 +; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX908-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX908-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX908-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v1, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX908-NEXT: v_add3_u32 v5, v5, v0, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v1, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX908-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v5, v1, v0, s9 +; GFX908-NEXT: v_mov_b32_e32 v0, v5 +; GFX908-NEXT: v_mov_b32_e32 v1, v6 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB19_1 @@ -6948,42 +6935,42 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s20 ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s8, s20, 0x400 +; GFX8-NEXT: s_add_i32 s4, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX8-NEXT: v_min_f32_e32 v1, v3, v1 -; GFX8-NEXT: v_min_f32_e32 v0, v6, v0 -; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v1 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v6, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX8-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX8-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v1, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v0 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_alignbit_b32 v3, v1, v0, 16 -; GFX8-NEXT: v_mov_b32_e32 v0, v3 -; GFX8-NEXT: v_mov_b32_e32 v5, s8 -; GFX8-NEXT: v_mov_b32_e32 v1, v4 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX8-NEXT: v_alignbit_b32 v5, v1, v0, 16 +; GFX8-NEXT: v_mov_b32_e32 v0, v5 +; GFX8-NEXT: v_mov_b32_e32 v1, v6 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB19_1 @@ -6995,38 +6982,38 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_min_f32_e32 v4, v4, v2 -; GFX7-NEXT: v_min_f32_e32 v5, v5, v3 +; GFX7-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX7-NEXT: v_min_f32_e32 v6, v6, v3 ; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v4 -; GFX7-NEXT: v_alignbit_b32 v0, v0, v5, 16 -; GFX7-NEXT: v_mov_b32_e32 v5, v1 -; GFX7-NEXT: v_mov_b32_e32 v6, s6 -; GFX7-NEXT: v_mov_b32_e32 v4, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v6, 16 +; GFX7-NEXT: v_mov_b32_e32 v6, v1 +; GFX7-NEXT: v_mov_b32_e32 v5, v0 +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB19_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7037,39 +7024,39 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v0 +; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, s6 ; GFX6-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_min_f32_e32 v4, v4, v2 -; GFX6-NEXT: v_min_f32_e32 v5, v5, v3 +; GFX6-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX6-NEXT: v_min_f32_e32 v6, v6, v3 ; GFX6-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v4 -; GFX6-NEXT: v_alignbit_b32 v0, v0, v5, 16 -; GFX6-NEXT: v_mov_b32_e32 v5, v1 -; GFX6-NEXT: v_mov_b32_e32 v6, s6 -; GFX6-NEXT: v_mov_b32_e32 v4, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; GFX6-NEXT: v_alignbit_b32 v0, v0, v6, 16 +; GFX6-NEXT: v_mov_b32_e32 v6, v1 +; GFX6-NEXT: v_mov_b32_e32 v5, v0 +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB19_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7089,43 +7076,41 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s16 -; GFX12-NEXT: s_add_co_i32 s6, s16, 0x400 +; GFX12-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0 +; GFX12-NEXT: s_add_co_i32 s4, s16, 0x400 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 +; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024 ; GFX12-NEXT: s_mov_b32 s5, 0 -; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], null offen offset:1024 ; GFX12-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_min_num_f32 v5, v5, v3 :: v_dual_min_num_f32 v0, v0, v2 +; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e64 s4, v0, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_min_num_f32_e32 v1, v3, v1 -; GFX12-NEXT: v_min_num_f32_e32 v3, v5, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v1 -; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-NEXT: v_add3_u32 v4, v4, v1, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s4, v1, v1 -; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e64 v1, v4, v6, s4 -; GFX12-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v5, s6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v1, v3, v1, 0x7060302 -; GFX12-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 +; GFX12-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX12-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 +; GFX12-NEXT: v_mov_b32_e32 v1, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -7139,40 +7124,40 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v1, s16 -; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s8, s16, 0x400 +; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 +; GFX940-NEXT: s_add_i32 s4, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[6:7], 0 -; GFX940-NEXT: s_movk_i32 s9, 0x7fff -; GFX940-NEXT: s_mov_b32 s10, 0x7060302 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX940-NEXT: s_movk_i32 s8, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX940-NEXT: s_mov_b32 s9, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, s4 ; GFX940-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX940-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX940-NEXT: v_min_f32_e32 v1, v2, v1 -; GFX940-NEXT: v_min_f32_e32 v2, v5, v4 -; GFX940-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_add3_u32 v4, v4, v1, s9 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s9 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX940-NEXT: v_mov_b32_e32 v6, s8 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] -; GFX940-NEXT: v_perm_b32 v2, v2, v1, s10 -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX940-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX940-NEXT: v_min_f32_e32 v5, v5, v3 +; GFX940-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v0, s8 +; GFX940-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[0:3], 0 offen sc0 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX940-NEXT: v_perm_b32 v0, v5, v0, s9 +; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[0:1] +; GFX940-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 ; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX940-NEXT: s_cbranch_execnz .LBB20_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7182,45 +7167,43 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s16 -; GFX11-NEXT: s_add_i32 s6, s16, 0x400 +; GFX11-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0 +; GFX11-NEXT: s_add_i32 s4, s16, 0x400 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 +; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: s_mov_b32 s5, 0 -; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_min_f32 v5, v5, v3 :: v_dual_min_f32 v0, v0, v2 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_min_f32_e32 v1, v3, v1 -; GFX11-NEXT: v_min_f32_e32 v3, v5, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v1 -; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-NEXT: v_add3_u32 v4, v4, v1, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s4, v1, v1 -; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v1, v4, v6, s4 -; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo -; GFX11-NEXT: v_mov_b32_e32 v5, s6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v1, v3, v1, 0x7060302 -; GFX11-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 +; GFX11-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, v5 ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -7234,39 +7217,39 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s20 -; GFX10-NEXT: s_add_i32 s6, s20, 0x400 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX10-NEXT: v_mov_b32_e32 v4, s4 +; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_min_f32_e32 v1, v3, v1 -; GFX10-NEXT: v_min_f32_e32 v3, v5, v4 -; GFX10-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1 -; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX10-NEXT: v_add3_u32 v4, v4, v1, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v1, v1 -; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v1, v4, v6, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v5, s6 -; GFX10-NEXT: v_perm_b32 v1, v3, v1, 0x7060302 -; GFX10-NEXT: v_mov_b32_e32 v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX10-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX10-NEXT: v_min_f32_e32 v5, v5, v3 +; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 +; GFX10-NEXT: v_mov_b32_e32 v6, v1 +; GFX10-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB20_1 @@ -7278,39 +7261,39 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, s20 -; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s8, s20, 0x400 +; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s4, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: s_movk_i32 s9, 0x7fff -; GFX90A-NEXT: s_mov_b32 s10, 0x7060302 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, s4 ; GFX90A-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX90A-NEXT: v_min_f32_e32 v1, v2, v1 -; GFX90A-NEXT: v_min_f32_e32 v2, v5, v4 -; GFX90A-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_add3_u32 v4, v4, v1, s9 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s9 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX90A-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc -; GFX90A-NEXT: v_perm_b32 v2, v2, v1, s10 -; GFX90A-NEXT: v_mov_b32_e32 v6, s8 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX90A-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX90A-NEXT: v_min_f32_e32 v5, v5, v3 +; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s9 +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB20_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7321,40 +7304,40 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, s20 -; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s8, s20, 0x400 +; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s4, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: s_movk_i32 s9, 0x7fff -; GFX908-NEXT: s_mov_b32 s10, 0x7060302 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX908-NEXT: s_mov_b32 s9, 0x7060302 +; GFX908-NEXT: v_mov_b32_e32 v4, s4 ; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX908-NEXT: v_min_f32_e32 v1, v3, v1 -; GFX908-NEXT: v_min_f32_e32 v3, v5, v4 -; GFX908-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX908-NEXT: v_add3_u32 v4, v4, v1, s9 -; GFX908-NEXT: v_add3_u32 v7, v7, v3, s9 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX908-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; GFX908-NEXT: v_perm_b32 v1, v3, v1, s10 -; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_mov_b32_e32 v6, s8 -; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v6, s[16:19], 0 offen glc +; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX908-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX908-NEXT: v_min_f32_e32 v5, v5, v3 +; GFX908-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v0, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9 +; GFX908-NEXT: v_mov_b32_e32 v6, v1 +; GFX908-NEXT: v_mov_b32_e32 v5, v0 +; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v2, v3 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB20_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7365,41 +7348,41 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s20 -; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s8, s20, 0x400 +; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s4, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: v_min_f32_e32 v1, v3, v1 -; GFX8-NEXT: v_min_f32_e32 v3, v5, v4 -; GFX8-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_alignbit_b32 v1, v3, v1, 16 -; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_mov_b32_e32 v6, s8 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v6, s[16:19], 0 offen glc +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX8-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX8-NEXT: v_min_f32_e32 v5, v5, v3 +; GFX8-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 +; GFX8-NEXT: v_mov_b32_e32 v6, v1 +; GFX8-NEXT: v_mov_b32_e32 v5, v0 +; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v2, v3 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB20_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7411,37 +7394,37 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s20 ; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v0 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX7-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_min_f32_e32 v4, v4, v0 -; GFX7-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX7-NEXT: v_alignbit_b32 v3, v3, v2, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; GFX7-NEXT: v_alignbit_b32 v2, v2, v5, 16 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_min_f32_e32 v5, v5, v0 +; GFX7-NEXT: v_min_f32_e32 v6, v6, v1 +; GFX7-NEXT: v_alignbit_b32 v4, v4, v3, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_alignbit_b32 v3, v3, v6, 16 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v6, s6 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB20_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7453,38 +7436,38 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s20 ; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v0 +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v0 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 ; GFX6-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_min_f32_e32 v4, v4, v0 -; GFX6-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX6-NEXT: v_alignbit_b32 v3, v3, v2, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; GFX6-NEXT: v_alignbit_b32 v2, v2, v5, 16 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_min_f32_e32 v5, v5, v0 +; GFX6-NEXT: v_min_f32_e32 v6, v6, v1 +; GFX6-NEXT: v_alignbit_b32 v4, v4, v3, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_alignbit_b32 v3, v3, v6, 16 +; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: v_mov_b32_e32 v6, s6 -; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB20_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7504,7 +7487,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_nc_u32_e32 v9, 0x400, v4 +; GFX12-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 ; GFX12-NEXT: s_mov_b32 s1, exec_lo ; GFX12-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 @@ -7520,43 +7503,42 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_load_b32 v8, v4, s[4:7], null offen offset:1024 +; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024 ; GFX12-NEXT: ; implicit-def: $vgpr4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB21_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s1 +; GFX12-NEXT: v_lshlrev_b32_e32 v8, 16, v5 +; GFX12-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Loop Header: Depth=1 ; GFX12-NEXT: ; Child Loop BB21_4 Depth 2 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX12-NEXT: v_and_b32_e32 v10, 0xffff0000, v8 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX12-NEXT: s_mov_b32 s2, exec_lo ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: v_min_num_f32_e32 v4, v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_min_num_f32_e32 v6, v10, v7 -; GFX12-NEXT: v_bfe_u32 v7, v4, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v11, 0x400000, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_bfe_u32 v10, v6, 16, 1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_min_num_f32 v5, v5, v9 :: v_dual_min_num_f32 v4, v4, v8 +; GFX12-NEXT: v_bfe_u32 v11, v5, 16, 1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v10, v4, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v12, 0x400000, v4 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-NEXT: v_or_b32_e32 v12, 0x400000, v6 -; GFX12-NEXT: v_add3_u32 v7, v7, v4, 0x7fff -; GFX12-NEXT: v_add3_u32 v10, v10, v6, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_cndmask_b32_e32 v4, v7, v11, vcc_lo -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-NEXT: v_cndmask_b32_e32 v6, v10, v12, vcc_lo +; GFX12-NEXT: v_or_b32_e32 v13, 0x400000, v5 +; GFX12-NEXT: v_add3_u32 v11, v11, v5, 0x7fff +; GFX12-NEXT: v_add3_u32 v10, v10, v4, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v7, v6, v4, 0x7060302 -; GFX12-NEXT: v_mov_b32_e32 v6, v7 -; GFX12-NEXT: v_mov_b32_e32 v7, v8 +; GFX12-NEXT: v_perm_b32 v5, v5, v4, 0x7060302 +; GFX12-NEXT: v_mov_b32_e32 v4, v5 +; GFX12-NEXT: v_mov_b32_e32 v5, v6 ; GFX12-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 @@ -7572,15 +7554,15 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB21_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 -; GFX12-NEXT: v_mov_b32_e32 v8, v6 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 +; GFX12-NEXT: v_mov_b32_e32 v6, v4 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -7588,14 +7570,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX12-NEXT: s_cbranch_execnz .LBB21_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: v_mov_b32_e32 v0, v6 +; GFX12-NEXT: v_mov_b32_e32 v0, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_u32_e32 v10, 0x400, v4 +; GFX940-NEXT: v_add_u32_e32 v8, 0x400, v4 ; GFX940-NEXT: s_mov_b64 s[2:3], exec ; GFX940-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 @@ -7607,40 +7589,40 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] ; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: buffer_load_dword v9, v4, s[4:7], 0 offen offset:1024 +; GFX940-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024 ; GFX940-NEXT: ; implicit-def: $vgpr4 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB21_1 ; GFX940-NEXT: ; %bb.2: ; GFX940-NEXT: s_mov_b64 exec, s[2:3] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v9, 16, v5 ; GFX940-NEXT: s_movk_i32 s10, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 ; GFX940-NEXT: s_mov_b32 s11, 0x7060302 ; GFX940-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Loop Header: Depth=1 ; GFX940-NEXT: ; Child Loop BB21_4 Depth 2 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v9 -; GFX940-NEXT: v_min_f32_e32 v4, v6, v4 -; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX940-NEXT: v_add3_u32 v6, v6, v4, s10 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v7 +; GFX940-NEXT: v_min_f32_e32 v4, v4, v9 +; GFX940-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX940-NEXT: v_add3_u32 v5, v5, v4, s10 +; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX940-NEXT: s_mov_b64 s[8:9], exec ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v9 -; GFX940-NEXT: v_min_f32_e32 v6, v7, v6 -; GFX940-NEXT: v_bfe_u32 v7, v6, 16, 1 -; GFX940-NEXT: v_add3_u32 v7, v7, v6, s10 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v6 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX940-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX940-NEXT: v_min_f32_e32 v5, v5, v10 +; GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX940-NEXT: v_add3_u32 v6, v6, v5, s10 +; GFX940-NEXT: v_or_b32_e32 v11, 0x400000, v5 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc -; GFX940-NEXT: v_perm_b32 v8, v6, v4, s11 -; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[8:9] +; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc +; GFX940-NEXT: v_perm_b32 v6, v5, v4, s11 +; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7] ; GFX940-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX940-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 @@ -7653,27 +7635,27 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_atomic_cmpswap v[6:7], v10, s[4:7], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB21_4 ; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX940-NEXT: s_mov_b64 exec, s[8:9] ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v9, v6 +; GFX940-NEXT: v_mov_b32_e32 v7, v4 ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB21_3 ; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x400, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_mov_b32 s2, exec_lo ; GFX11-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 @@ -7687,41 +7669,42 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-NEXT: buffer_load_b32 v8, v4, s[4:7], 0 offen offset:1024 +; GFX11-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024 ; GFX11-NEXT: ; implicit-def: $vgpr4 ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB21_1 ; GFX11-NEXT: ; %bb.2: ; GFX11-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v5 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 +; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-NEXT: ; Child Loop BB21_4 Depth 2 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v8 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX11-NEXT: s_mov_b32 s2, exec_lo ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_min_f32_e32 v4, v6, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_min_f32_e32 v6, v10, v7 -; GFX11-NEXT: v_bfe_u32 v7, v4, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_bfe_u32 v10, v6, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_min_f32 v5, v5, v9 :: v_dual_min_f32 v4, v4, v8 +; GFX11-NEXT: v_bfe_u32 v11, v5, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v10, v4, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v4 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v6 -; GFX11-NEXT: v_add3_u32 v7, v7, v4, 0x7fff -; GFX11-NEXT: v_add3_u32 v10, v10, v6, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e32 v4, v7, v11, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: v_cndmask_b32_e32 v6, v10, v12, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v5 +; GFX11-NEXT: v_add3_u32 v11, v11, v5, 0x7fff +; GFX11-NEXT: v_add3_u32 v10, v10, v4, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v7, v6, v4, 0x7060302 -; GFX11-NEXT: v_mov_b32_e32 v6, v7 -; GFX11-NEXT: v_mov_b32_e32 v7, v8 +; GFX11-NEXT: v_perm_b32 v5, v5, v4, 0x7060302 +; GFX11-NEXT: v_mov_b32_e32 v4, v5 +; GFX11-NEXT: v_mov_b32_e32 v5, v6 ; GFX11-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 @@ -7735,14 +7718,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], 0 offen glc +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB21_4 ; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 -; GFX11-NEXT: v_mov_b32_e32 v8, v6 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 +; GFX11-NEXT: v_mov_b32_e32 v6, v4 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 @@ -7750,14 +7733,15 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_cbranch_execnz .LBB21_3 ; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, v6 +; GFX11-NEXT: v_mov_b32_e32 v0, v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_nc_u32_e32 v9, 0x400, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 @@ -7769,38 +7753,38 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 -; GFX10-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 ; GFX10-NEXT: ; implicit-def: $vgpr4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB21_1 ; GFX10-NEXT: ; %bb.2: ; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v5 +; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 ; GFX10-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Loop Header: Depth=1 ; GFX10-NEXT: ; Child Loop BB21_4 Depth 2 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v8 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_min_f32_e32 v4, v6, v4 -; GFX10-NEXT: v_min_f32_e32 v6, v10, v7 -; GFX10-NEXT: v_bfe_u32 v7, v4, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v4 -; GFX10-NEXT: v_bfe_u32 v10, v6, 16, 1 +; GFX10-NEXT: v_min_f32_e32 v4, v4, v8 +; GFX10-NEXT: v_min_f32_e32 v5, v5, v9 +; GFX10-NEXT: v_bfe_u32 v10, v4, 16, 1 +; GFX10-NEXT: v_bfe_u32 v11, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v4 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v6 -; GFX10-NEXT: v_add3_u32 v7, v7, v4, 0x7fff -; GFX10-NEXT: v_add3_u32 v10, v10, v6, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v4, v7, v11, vcc_lo -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v10, v12, vcc_lo -; GFX10-NEXT: v_perm_b32 v7, v6, v4, 0x7060302 -; GFX10-NEXT: v_mov_b32_e32 v6, v7 -; GFX10-NEXT: v_mov_b32_e32 v7, v8 +; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v5 +; GFX10-NEXT: v_add3_u32 v10, v10, v4, 0x7fff +; GFX10-NEXT: v_add3_u32 v11, v11, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo +; GFX10-NEXT: v_perm_b32 v5, v5, v4, 0x7060302 +; GFX10-NEXT: v_mov_b32_e32 v4, v5 +; GFX10-NEXT: v_mov_b32_e32 v5, v6 ; GFX10-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 @@ -7812,15 +7796,15 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB21_4 ; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 -; GFX10-NEXT: v_mov_b32_e32 v8, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 +; GFX10-NEXT: v_mov_b32_e32 v6, v4 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 @@ -7829,13 +7813,13 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX10-NEXT: s_cbranch_execnz .LBB21_3 ; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: v_mov_b32_e32 v0, v6 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_u32_e32 v10, 0x400, v4 +; GFX90A-NEXT: v_add_u32_e32 v8, 0x400, v4 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec ; GFX90A-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -7847,38 +7831,38 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_load_dword v9, v4, s[8:11], 0 offen offset:1024 +; GFX90A-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 ; GFX90A-NEXT: ; implicit-def: $vgpr4 ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB21_1 ; GFX90A-NEXT: ; %bb.2: ; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v9, 16, v5 ; GFX90A-NEXT: s_movk_i32 s14, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 ; GFX90A-NEXT: s_mov_b32 s15, 0x7060302 ; GFX90A-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 ; GFX90A-NEXT: ; Child Loop BB21_4 Depth 2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v9 -; GFX90A-NEXT: v_min_f32_e32 v4, v6, v4 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s14 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v7 +; GFX90A-NEXT: v_min_f32_e32 v4, v4, v9 +; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s14 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v9 -; GFX90A-NEXT: v_min_f32_e32 v6, v7, v6 -; GFX90A-NEXT: v_bfe_u32 v7, v6, 16, 1 -; GFX90A-NEXT: v_add3_u32 v7, v7, v6, s14 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc -; GFX90A-NEXT: v_perm_b32 v8, v6, v4, s15 +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX90A-NEXT: v_min_f32_e32 v5, v5, v10 +; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s14 +; GFX90A-NEXT: v_or_b32_e32 v11, 0x400000, v5 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc +; GFX90A-NEXT: v_perm_b32 v6, v5, v4, s15 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[8:9], v[8:9] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -7890,27 +7874,27 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v10, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB21_4 ; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX90A-NEXT: s_mov_b64 exec, s[12:13] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v9, v6 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB21_3 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_u32_e32 v9, 0x400, v4 +; GFX908-NEXT: v_add_u32_e32 v7, 0x400, v4 ; GFX908-NEXT: s_mov_b64 s[6:7], exec ; GFX908-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 @@ -7922,39 +7906,39 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 +; GFX908-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 ; GFX908-NEXT: ; implicit-def: $vgpr4 ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB21_1 ; GFX908-NEXT: ; %bb.2: ; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v8, 16, v5 ; GFX908-NEXT: s_movk_i32 s14, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 ; GFX908-NEXT: s_mov_b32 s15, 0x7060302 ; GFX908-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Loop Header: Depth=1 ; GFX908-NEXT: ; Child Loop BB21_4 Depth 2 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; GFX908-NEXT: v_min_f32_e32 v4, v6, v4 -; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX908-NEXT: v_add3_u32 v6, v6, v4, s14 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX908-NEXT: v_min_f32_e32 v4, v4, v8 +; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX908-NEXT: v_add3_u32 v5, v5, v4, s14 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v4 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX908-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v8 -; GFX908-NEXT: v_min_f32_e32 v6, v7, v6 -; GFX908-NEXT: v_bfe_u32 v7, v6, 16, 1 -; GFX908-NEXT: v_add3_u32 v7, v7, v6, s14 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX908-NEXT: v_cndmask_b32_e32 v6, v7, v10, vcc -; GFX908-NEXT: v_perm_b32 v7, v6, v4, s15 -; GFX908-NEXT: v_mov_b32_e32 v6, v7 +; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v10, vcc +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_min_f32_e32 v5, v5, v9 +; GFX908-NEXT: v_bfe_u32 v10, v5, 16, 1 +; GFX908-NEXT: v_add3_u32 v10, v10, v5, s14 +; GFX908-NEXT: v_or_b32_e32 v11, 0x400000, v5 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v4, s15 +; GFX908-NEXT: v_mov_b32_e32 v4, v5 ; GFX908-NEXT: s_mov_b64 s[12:13], exec -; GFX908-NEXT: v_mov_b32_e32 v7, v8 +; GFX908-NEXT: v_mov_b32_e32 v5, v6 ; GFX908-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 @@ -7966,27 +7950,27 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB21_4 ; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v8, v6 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB21_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v0, v6 +; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x400, v4 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x400, v4 ; GFX8-NEXT: s_mov_b64 s[6:7], exec ; GFX8-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 @@ -7998,40 +7982,40 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 +; GFX8-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 ; GFX8-NEXT: ; implicit-def: $vgpr4 ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB21_1 ; GFX8-NEXT: ; %bb.2: ; GFX8-NEXT: s_mov_b64 exec, s[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v5 +; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 ; GFX8-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Loop Header: Depth=1 ; GFX8-NEXT: ; Child Loop BB21_4 Depth 2 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; GFX8-NEXT: v_min_f32_e32 v4, v6, v4 -; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX8-NEXT: v_min_f32_e32 v4, v4, v8 +; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v4 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v8 -; GFX8-NEXT: v_min_f32_e32 v6, v7, v6 -; GFX8-NEXT: v_bfe_u32 v7, v6, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v7, v10, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v7, v6, v4, 16 -; GFX8-NEXT: v_mov_b32_e32 v6, v7 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v10, vcc +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_min_f32_e32 v5, v5, v9 +; GFX8-NEXT: v_bfe_u32 v10, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v5 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, 0x7fff, v10 +; GFX8-NEXT: v_or_b32_e32 v11, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v5, v5, v4, 16 +; GFX8-NEXT: v_mov_b32_e32 v4, v5 ; GFX8-NEXT: s_mov_b64 s[12:13], exec -; GFX8-NEXT: v_mov_b32_e32 v7, v8 +; GFX8-NEXT: v_mov_b32_e32 v5, v6 ; GFX8-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 @@ -8043,21 +8027,21 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB21_4 ; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v8, v6 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB21_3 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v0, v6 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -8236,19 +8220,19 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_ ; GFX940-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 ; GFX940-NEXT: v_mov_b32_e32 v0, s16 ; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 ; GFX940-NEXT: s_add_i32 s6, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, s6 ; GFX940-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v0 -; GFX940-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, s6 +; GFX940-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5] ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 @@ -8287,19 +8271,19 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_ ; GFX90A-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 +; GFX90A-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc @@ -8317,25 +8301,25 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_ ; GFX908-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v1, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX908-NEXT: v_max_f32_e32 v0, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v0, v1 -; GFX908-NEXT: v_mov_b32_e32 v0, v3 -; GFX908-NEXT: v_mov_b32_e32 v5, s6 -; GFX908-NEXT: v_mov_b32_e32 v1, v4 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v5, v0 +; GFX908-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX908-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB22_1 @@ -8352,19 +8336,19 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_ ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 +; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v4 -; GFX8-NEXT: v_min_f32_e32 v3, v0, v2 -; GFX8-NEXT: v_mov_b32_e32 v0, v3 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, v4 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v5, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; GFX8-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB22_1 diff --git a/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll b/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll index 28a2245406842..3216e71e6221a 100644 --- a/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll +++ b/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll @@ -245,14 +245,14 @@ define void @recursive_phis(i1 %cond, ptr addrspace(5) %ptr) { ; GISEL-ASM-NEXT: v_and_b32_e32 v0, 0xffff, v1 ; GISEL-ASM-NEXT: ; %bb.2: ; %finallyendcf.split ; GISEL-ASM-NEXT: s_or_b64 exec, exec, s[6:7] -; GISEL-ASM-NEXT: s_mov_b64 s[6:7], 0 ; GISEL-ASM-NEXT: s_mov_b64 s[8:9], src_private_base +; GISEL-ASM-NEXT: s_mov_b64 s[6:7], 0 +; GISEL-ASM-NEXT: v_mov_b32_e32 v1, s9 ; GISEL-ASM-NEXT: v_mov_b32_e32 v2, 7 ; GISEL-ASM-NEXT: .LBB7_3: ; %finally ; GISEL-ASM-NEXT: ; =>This Inner Loop Header: Depth=1 -; GISEL-ASM-NEXT: s_and_b64 s[10:11], exec, s[4:5] -; GISEL-ASM-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7] -; GISEL-ASM-NEXT: v_mov_b32_e32 v1, s9 +; GISEL-ASM-NEXT: s_and_b64 s[8:9], exec, s[4:5] +; GISEL-ASM-NEXT: s_or_b64 s[6:7], s[8:9], s[6:7] ; GISEL-ASM-NEXT: flat_store_dword v[0:1], v2 ; GISEL-ASM-NEXT: s_waitcnt vmcnt(0) ; GISEL-ASM-NEXT: s_andn2_b64 exec, exec, s[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll b/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll index a8ddece611783..9104dc68eb9b4 100644 --- a/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll +++ b/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll @@ -6,90 +6,95 @@ define amdgpu_kernel void @cannot_create_empty_or_backwards_segment(i1 %arg, i1 ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_mov_b64 s[26:27], s[2:3] ; CHECK-NEXT: s_mov_b64 s[24:25], s[0:1] -; CHECK-NEXT: s_load_dword s4, s[8:9], 0x0 +; CHECK-NEXT: s_load_dword s2, s[8:9], 0x0 ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; CHECK-NEXT: s_load_dword s12, s[8:9], 0x4 +; CHECK-NEXT: s_load_dword s6, s[8:9], 0x4 ; CHECK-NEXT: s_add_u32 s24, s24, s15 ; CHECK-NEXT: s_addc_u32 s25, s25, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_bitcmp1_b32 s4, 0 +; CHECK-NEXT: s_bitcmp1_b32 s2, 0 +; CHECK-NEXT: s_cselect_b64 s[16:17], -1, 0 +; CHECK-NEXT: s_bitcmp1_b32 s2, 8 +; CHECK-NEXT: s_cselect_b64 s[10:11], -1, 0 +; CHECK-NEXT: s_bitcmp1_b32 s2, 16 ; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 -; CHECK-NEXT: s_bitcmp1_b32 s4, 8 -; CHECK-NEXT: s_cselect_b64 s[6:7], -1, 0 -; CHECK-NEXT: s_bitcmp1_b32 s4, 16 -; CHECK-NEXT: s_cselect_b64 s[8:9], -1, 0 ; CHECK-NEXT: s_bitcmp1_b32 s0, 24 -; CHECK-NEXT: s_cselect_b64 s[4:5], -1, 0 -; CHECK-NEXT: s_xor_b64 s[14:15], s[4:5], -1 +; CHECK-NEXT: s_cselect_b64 s[8:9], -1, 0 +; CHECK-NEXT: s_xor_b64 s[4:5], s[8:9], -1 ; CHECK-NEXT: s_bitcmp1_b32 s1, 0 -; CHECK-NEXT: s_cselect_b64 s[10:11], -1, 0 -; CHECK-NEXT: s_bitcmp1_b32 s12, 8 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] ; CHECK-NEXT: s_cselect_b64 s[12:13], -1, 0 -; CHECK-NEXT: s_and_b64 s[0:1], exec, s[14:15] +; CHECK-NEXT: s_bitcmp1_b32 s6, 8 +; CHECK-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[16:17] +; CHECK-NEXT: s_cselect_b64 s[14:15], -1, 0 +; CHECK-NEXT: s_and_b64 s[4:5], exec, s[4:5] +; CHECK-NEXT: s_and_b64 s[6:7], exec, s[10:11] +; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: s_branch .LBB0_3 ; CHECK-NEXT: .LBB0_1: ; in Loop: Header=BB0_3 Depth=1 -; CHECK-NEXT: s_mov_b64 s[16:17], 0 -; CHECK-NEXT: s_mov_b64 s[18:19], -1 -; CHECK-NEXT: s_mov_b64 s[14:15], -1 +; CHECK-NEXT: s_mov_b64 s[18:19], 0 ; CHECK-NEXT: s_mov_b64 s[20:21], -1 +; CHECK-NEXT: s_mov_b64 s[16:17], -1 +; CHECK-NEXT: s_mov_b64 s[22:23], -1 ; CHECK-NEXT: .LBB0_2: ; %Flow7 ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 -; CHECK-NEXT: s_and_b64 vcc, exec, s[20:21] +; CHECK-NEXT: s_and_b64 vcc, exec, s[22:23] ; CHECK-NEXT: s_cbranch_vccnz .LBB0_12 ; CHECK-NEXT: .LBB0_3: ; %bb7 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_andn2_b64 vcc, exec, s[8:9] +; CHECK-NEXT: s_and_b64 vcc, exec, s[2:3] ; CHECK-NEXT: s_cbranch_vccnz .LBB0_1 ; CHECK-NEXT: ; %bb.4: ; %bb8 ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 -; CHECK-NEXT: s_mov_b64 vcc, s[0:1] +; CHECK-NEXT: s_mov_b64 vcc, s[4:5] ; CHECK-NEXT: s_cbranch_vccz .LBB0_6 ; CHECK-NEXT: ; %bb.5: ; %bb9 ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 -; CHECK-NEXT: s_mov_b64 s[14:15], 0 -; CHECK-NEXT: s_mov_b64 s[16:17], -1 -; CHECK-NEXT: s_mov_b64 s[20:21], s[6:7] +; CHECK-NEXT: s_mov_b64 s[16:17], 0 +; CHECK-NEXT: s_mov_b64 s[18:19], -1 +; CHECK-NEXT: s_mov_b64 s[22:23], s[10:11] ; CHECK-NEXT: s_cbranch_execz .LBB0_7 ; CHECK-NEXT: s_branch .LBB0_8 ; CHECK-NEXT: .LBB0_6: ; in Loop: Header=BB0_3 Depth=1 -; CHECK-NEXT: s_mov_b64 s[14:15], -1 -; CHECK-NEXT: s_mov_b64 s[16:17], 0 -; CHECK-NEXT: s_mov_b64 s[20:21], 0 +; CHECK-NEXT: s_mov_b64 s[16:17], -1 +; CHECK-NEXT: s_mov_b64 s[18:19], 0 +; CHECK-NEXT: s_mov_b64 s[22:23], 0 ; CHECK-NEXT: .LBB0_7: ; %bb10 ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 -; CHECK-NEXT: s_mov_b64 s[16:17], -1 -; CHECK-NEXT: s_mov_b64 s[14:15], 0 -; CHECK-NEXT: s_mov_b64 s[20:21], s[12:13] +; CHECK-NEXT: s_mov_b64 s[18:19], -1 +; CHECK-NEXT: s_mov_b64 s[16:17], 0 +; CHECK-NEXT: s_mov_b64 s[22:23], s[14:15] ; CHECK-NEXT: .LBB0_8: ; %Flow9 ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 -; CHECK-NEXT: s_mov_b64 s[18:19], -1 -; CHECK-NEXT: s_andn2_b64 vcc, exec, s[20:21] ; CHECK-NEXT: s_mov_b64 s[20:21], -1 +; CHECK-NEXT: s_andn2_b64 vcc, exec, s[22:23] +; CHECK-NEXT: s_mov_b64 s[22:23], -1 ; CHECK-NEXT: s_cbranch_vccnz .LBB0_2 ; CHECK-NEXT: ; %bb.9: ; %bb13 ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 -; CHECK-NEXT: s_mov_b64 s[14:15], 0 -; CHECK-NEXT: s_and_b64 vcc, exec, s[6:7] -; CHECK-NEXT: s_mov_b64 s[18:19], 0 +; CHECK-NEXT: s_mov_b64 s[16:17], 0 +; CHECK-NEXT: s_mov_b64 s[20:21], 0 +; CHECK-NEXT: s_mov_b64 vcc, s[6:7] ; CHECK-NEXT: s_cbranch_vccz .LBB0_11 ; CHECK-NEXT: ; %bb.10: ; %bb16 ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 -; CHECK-NEXT: s_mov_b64 s[18:19], -1 -; CHECK-NEXT: s_mov_b64 s[20:21], s[10:11] +; CHECK-NEXT: s_mov_b64 s[20:21], -1 +; CHECK-NEXT: s_mov_b64 s[22:23], s[12:13] ; CHECK-NEXT: .LBB0_11: ; %Flow11 ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 -; CHECK-NEXT: s_mov_b64 s[16:17], 0 +; CHECK-NEXT: s_mov_b64 s[18:19], 0 ; CHECK-NEXT: s_branch .LBB0_2 ; CHECK-NEXT: .LBB0_12: ; %loop.exit.guard6 ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 -; CHECK-NEXT: s_xor_b64 s[20:21], s[18:19], -1 -; CHECK-NEXT: s_mov_b64 s[18:19], -1 -; CHECK-NEXT: s_and_b64 vcc, exec, s[20:21] +; CHECK-NEXT: s_xor_b64 s[22:23], s[20:21], -1 +; CHECK-NEXT: s_mov_b64 s[20:21], -1 +; CHECK-NEXT: s_and_b64 vcc, exec, s[22:23] ; CHECK-NEXT: s_cbranch_vccz .LBB0_16 ; CHECK-NEXT: ; %bb.13: ; %bb14 ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 -; CHECK-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; CHECK-NEXT: s_and_b64 vcc, exec, s[0:1] ; CHECK-NEXT: s_cbranch_vccnz .LBB0_15 ; CHECK-NEXT: ; %bb.14: ; %bb15 ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 @@ -97,22 +102,22 @@ define amdgpu_kernel void @cannot_create_empty_or_backwards_segment(i1 %arg, i1 ; CHECK-NEXT: buffer_store_dword v0, off, s[24:27], 0 ; CHECK-NEXT: .LBB0_15: ; %Flow ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 -; CHECK-NEXT: s_mov_b64 s[18:19], 0 +; CHECK-NEXT: s_mov_b64 s[20:21], 0 ; CHECK-NEXT: .LBB0_16: ; %Flow13 ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 -; CHECK-NEXT: s_andn2_b64 vcc, exec, s[18:19] +; CHECK-NEXT: s_andn2_b64 vcc, exec, s[20:21] ; CHECK-NEXT: s_cbranch_vccnz .LBB0_3 ; CHECK-NEXT: ; %bb.17: ; %loop.exit.guard -; CHECK-NEXT: s_and_b64 vcc, exec, s[14:15] +; CHECK-NEXT: s_and_b64 vcc, exec, s[16:17] ; CHECK-NEXT: s_cbranch_vccnz .LBB0_22 ; CHECK-NEXT: ; %bb.18: ; %loop.exit.guard5 -; CHECK-NEXT: s_and_b64 vcc, exec, s[16:17] +; CHECK-NEXT: s_and_b64 vcc, exec, s[18:19] ; CHECK-NEXT: s_cbranch_vccnz .LBB0_23 ; CHECK-NEXT: ; %bb.19: ; %bb17 -; CHECK-NEXT: s_and_b64 vcc, exec, s[4:5] +; CHECK-NEXT: s_and_b64 vcc, exec, s[8:9] ; CHECK-NEXT: s_cbranch_vccz .LBB0_21 ; CHECK-NEXT: ; %bb.20: ; %bb19 -; CHECK-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; CHECK-NEXT: s_and_b64 vcc, exec, s[0:1] ; CHECK-NEXT: s_cbranch_vccz .LBB0_22 ; CHECK-NEXT: .LBB0_21: ; %bb18 ; CHECK-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll index 4281a4b74cbae..ff48a3fc98018 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll @@ -16515,39 +16515,41 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB68_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: v_dual_add_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v3, v6, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -16562,34 +16564,34 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB68_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_add_f32_e32 v3, v6, v3 -; GFX10-NEXT: v_add_f32_e32 v5, v7, v5 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB68_1 @@ -16603,33 +16605,33 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB68_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_add_f32_e32 v4, v6, v4 -; GFX90A-NEXT: v_add_f32_e32 v3, v7, v3 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB68_1 @@ -16643,33 +16645,33 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB68_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_add_f32_e32 v5, v6, v5 -; GFX908-NEXT: v_add_f32_e32 v3, v7, v3 -; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX908-NEXT: v_mov_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB68_1 @@ -16683,34 +16685,34 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB68_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_add_f32_e32 v5, v6, v5 -; GFX8-NEXT: v_add_f32_e32 v3, v7, v3 -; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v3 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB68_1 @@ -16788,39 +16790,41 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB69_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: v_dual_add_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v3, v6, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -16836,35 +16840,35 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] ; GFX10-NEXT: .LBB69_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX10-NEXT: v_add_f32_e32 v0, v6, v0 -; GFX10-NEXT: v_add_f32_e32 v5, v7, v5 -; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v0, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB69_1 @@ -16877,33 +16881,33 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB69_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_add_f32_e32 v4, v6, v4 -; GFX90A-NEXT: v_add_f32_e32 v3, v7, v3 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB69_1 @@ -16917,33 +16921,33 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB69_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_add_f32_e32 v5, v6, v5 -; GFX908-NEXT: v_add_f32_e32 v3, v7, v3 -; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc +; GFX908-NEXT: v_mov_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB69_1 @@ -16959,34 +16963,34 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB69_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX8-NEXT: v_add_f32_e32 v5, v6, v5 -; GFX8-NEXT: v_add_f32_e32 v0, v7, v0 -; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB69_1 @@ -17074,38 +17078,40 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3 ; GFX11-NEXT: flat_load_b32 v0, v[4:5] ; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB70_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_lshlrev_b32 v0, 16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: v_dual_add_f32 v0, v6, v0 :: v_dual_and_b32 v7, 0xffff0000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v5, v7, v5 -; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_mov_b32_e32 v6, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v0 ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v0, v0 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v0, v6, v8, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v0, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v0, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v0, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[0:1] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[5:6] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -17120,35 +17126,35 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] ; GFX10-NEXT: .LBB70_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX10-NEXT: v_add_f32_e32 v0, v6, v0 -; GFX10-NEXT: v_add_f32_e32 v5, v7, v5 -; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v0, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB70_1 @@ -17165,33 +17171,33 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: flat_load_dword v0, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB70_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX90A-NEXT: v_add_f32_e32 v3, v6, v3 -; GFX90A-NEXT: v_add_f32_e32 v0, v7, v0 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v0, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v0, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v0, v3, v0, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 +; GFX90A-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX90A-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v3, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v6, v3, v0, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB70_1 @@ -17208,33 +17214,33 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX908-NEXT: flat_load_dword v0, v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB70_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX908-NEXT: v_add_f32_e32 v5, v6, v5 -; GFX908-NEXT: v_add_f32_e32 v0, v7, v0 -; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v0, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v0, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9 -; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX908-NEXT: v_mov_b32_e32 v6, v0 +; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v0, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX908-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v0, s9 +; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB70_1 @@ -17249,34 +17255,34 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB70_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX8-NEXT: v_add_f32_e32 v5, v6, v5 -; GFX8-NEXT: v_add_f32_e32 v0, v7, v0 -; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB70_1 @@ -17353,41 +17359,41 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB71_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX11-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -17400,35 +17406,35 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX10-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: flat_load_dword v4, v[0:1] +; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB71_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX10-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB71_1 @@ -17439,36 +17445,36 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX90A-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v5, v[0:1] +; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB71_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_add_f32_e32 v3, v4, v3 -; GFX90A-NEXT: v_add_f32_e32 v4, v7, v6 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB71_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -17478,36 +17484,36 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX908-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v4, v[0:1] +; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB71_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX908-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX908-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX908-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB71_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -17517,37 +17523,37 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX8-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB71_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX8-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB71_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -17620,41 +17626,41 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2044 +; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB72_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX11-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_add_f32_e32 v6, v6, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -17669,35 +17675,35 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: flat_load_dword v4, v[0:1] +; GFX10-NEXT: flat_load_dword v3, v[0:1] ; GFX10-NEXT: .LBB72_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX10-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB72_1 @@ -17708,36 +17714,36 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX90A-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB72_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_add_f32_e32 v3, v4, v3 -; GFX90A-NEXT: v_add_f32_e32 v4, v7, v6 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB72_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -17747,36 +17753,36 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX908-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 +; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB72_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX908-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX908-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX908-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB72_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -17788,37 +17794,37 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB72_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX8-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB72_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -17901,41 +17907,41 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX11-NEXT: flat_load_b32 v4, v[3:4] +; GFX11-NEXT: flat_load_b32 v3, v[3:4] +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB73_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX11-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -17950,35 +17956,35 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: flat_load_dword v4, v[0:1] +; GFX10-NEXT: flat_load_dword v3, v[0:1] ; GFX10-NEXT: .LBB73_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX10-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB73_1 @@ -17995,28 +18001,28 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: flat_load_dword v1, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB73_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX90A-NEXT: v_add_f32_e32 v0, v3, v0 -; GFX90A-NEXT: v_add_f32_e32 v3, v7, v6 -; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GFX90A-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX90A-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX90A-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v0, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v0, v3, v0, s9 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v0, v6, v0, s9 ; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -18038,28 +18044,28 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX908-NEXT: flat_load_dword v1, v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB73_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v1 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX908-NEXT: v_add_f32_e32 v0, v5, v0 -; GFX908-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX908-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v0, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GFX908-NEXT: v_add_f32_e32 v0, v0, v5 +; GFX908-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX908-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_add3_u32 v7, v7, v0, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9 +; GFX908-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v0, v6, v0, s9 ; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -18077,37 +18083,37 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB73_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX8-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB73_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -18185,39 +18191,41 @@ define <2 x bfloat> @flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB74_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: v_dual_add_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v3, v6, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -18233,35 +18241,35 @@ define <2 x bfloat> @flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] ; GFX10-NEXT: .LBB74_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX10-NEXT: v_add_f32_e32 v0, v6, v0 -; GFX10-NEXT: v_add_f32_e32 v5, v7, v5 -; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v0, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB74_1 @@ -18274,35 +18282,35 @@ define <2 x bfloat> @flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB74_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_add_f32_e32 v4, v6, v4 -; GFX90A-NEXT: v_add_f32_e32 v3, v7, v3 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB74_1 @@ -18316,33 +18324,33 @@ define <2 x bfloat> @flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB74_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_add_f32_e32 v5, v6, v5 -; GFX908-NEXT: v_add_f32_e32 v3, v7, v3 -; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc +; GFX908-NEXT: v_mov_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB74_1 @@ -18358,34 +18366,34 @@ define <2 x bfloat> @flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB74_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX8-NEXT: v_add_f32_e32 v5, v6, v5 -; GFX8-NEXT: v_add_f32_e32 v0, v7, v0 -; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB74_1 @@ -18463,41 +18471,41 @@ define void @flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX11-LABEL: flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2044 +; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB75_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX11-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -18512,35 +18520,35 @@ define void @flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: flat_load_dword v4, v[0:1] +; GFX10-NEXT: flat_load_dword v3, v[0:1] ; GFX10-NEXT: .LBB75_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX10-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB75_1 @@ -18551,38 +18559,38 @@ define void @flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX90A-LABEL: flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB75_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_add_f32_e32 v3, v4, v3 -; GFX90A-NEXT: v_add_f32_e32 v4, v7, v6 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB75_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -18592,36 +18600,36 @@ define void @flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX908-LABEL: flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 +; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB75_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX908-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX908-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX908-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB75_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -18633,37 +18641,37 @@ define void @flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB75_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX8-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB75_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -18740,39 +18748,41 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory( ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB76_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: v_dual_add_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v3, v6, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -18787,34 +18797,34 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory( ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB76_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_add_f32_e32 v3, v6, v3 -; GFX10-NEXT: v_add_f32_e32 v5, v7, v5 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB76_1 @@ -18828,33 +18838,33 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory( ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB76_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_add_f32_e32 v4, v6, v4 -; GFX90A-NEXT: v_add_f32_e32 v3, v7, v3 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB76_1 @@ -18868,33 +18878,33 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory( ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB76_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_add_f32_e32 v5, v6, v5 -; GFX908-NEXT: v_add_f32_e32 v3, v7, v3 -; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX908-NEXT: v_mov_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB76_1 @@ -18908,34 +18918,34 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory( ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB76_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_add_f32_e32 v5, v6, v5 -; GFX8-NEXT: v_add_f32_e32 v3, v7, v3 -; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v3 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB76_1 @@ -19012,41 +19022,41 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr %p ; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB77_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX11-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -19059,35 +19069,35 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr %p ; GFX10-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: flat_load_dword v4, v[0:1] +; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB77_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX10-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB77_1 @@ -19098,36 +19108,36 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr %p ; GFX90A-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v5, v[0:1] +; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB77_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_add_f32_e32 v3, v4, v3 -; GFX90A-NEXT: v_add_f32_e32 v4, v7, v6 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB77_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -19137,36 +19147,36 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr %p ; GFX908-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v4, v[0:1] +; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB77_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX908-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX908-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX908-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB77_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -19176,37 +19186,37 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr %p ; GFX8-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB77_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX8-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB77_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -19280,39 +19290,41 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB78_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: v_dual_add_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v3, v6, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -19327,34 +19339,34 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB78_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_add_f32_e32 v3, v6, v3 -; GFX10-NEXT: v_add_f32_e32 v5, v7, v5 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB78_1 @@ -19368,33 +19380,33 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB78_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_add_f32_e32 v4, v6, v4 -; GFX90A-NEXT: v_add_f32_e32 v3, v7, v3 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB78_1 @@ -19408,33 +19420,33 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB78_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_add_f32_e32 v5, v6, v5 -; GFX908-NEXT: v_add_f32_e32 v3, v7, v3 -; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX908-NEXT: v_mov_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB78_1 @@ -19448,34 +19460,34 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB78_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_add_f32_e32 v5, v6, v5 -; GFX8-NEXT: v_add_f32_e32 v3, v7, v3 -; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v3 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB78_1 @@ -19552,41 +19564,41 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory_ ; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB79_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX11-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -19599,35 +19611,35 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory_ ; GFX10-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: flat_load_dword v4, v[0:1] +; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB79_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX10-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB79_1 @@ -19638,36 +19650,36 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory_ ; GFX90A-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v5, v[0:1] +; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB79_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_add_f32_e32 v3, v4, v3 -; GFX90A-NEXT: v_add_f32_e32 v4, v7, v6 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB79_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -19677,36 +19689,36 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory_ ; GFX908-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v4, v[0:1] +; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB79_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX908-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX908-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX908-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB79_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -19716,37 +19728,37 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory_ ; GFX8-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB79_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX8-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB79_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll index 36d7201c6fe46..36aa73fbf8e92 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll @@ -31,13 +31,13 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v3, v4 +; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -76,13 +76,13 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4 +; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -100,13 +100,13 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v5 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -173,13 +173,13 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grai ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v3, v4 +; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -220,13 +220,13 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grai ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4 +; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -244,13 +244,13 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grai ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v5 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -329,18 +329,18 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX940-NEXT: s_mov_b32 s1, -1 ; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX940-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 -; GFX940-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX940-NEXT: v_max_f32_e32 v0, v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_max_f32_e32 v0, v3, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v0, v1 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB2_1 @@ -381,17 +381,17 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: flat_load_dword v0, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX90A-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 -; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX90A-NEXT: v_max_f32_e32 v0, v0, v3 -; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_max_f32_e32 v0, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v0, v1 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB2_1 @@ -408,17 +408,17 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX908-NEXT: flat_load_dword v0, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX908-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 -; GFX908-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX908-NEXT: v_max_f32_e32 v0, v0, v5 -; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX908-NEXT: v_mov_b32_e32 v6, v0 +; GFX908-NEXT: v_max_f32_e32 v0, v6, v6 +; GFX908-NEXT: v_max_f32_e32 v5, v0, v1 +; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB2_1 @@ -482,21 +482,21 @@ define void @flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX940-LABEL: flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v5, v[0:1] +; GFX940-NEXT: flat_load_dword v3, v[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v4, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB3_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -528,20 +528,20 @@ define void @flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX90A-LABEL: flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v5, v[0:1] +; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB3_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -551,20 +551,20 @@ define void @flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX908-LABEL: flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v4, v[0:1] +; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB3_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -622,21 +622,21 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX940-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2044 +; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v4, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB4_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -670,20 +670,20 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX90A-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB4_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -693,20 +693,20 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX908-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 +; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc +; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB4_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -773,23 +773,23 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_gra ; GFX940-NEXT: s_movk_i32 s0, 0xf800 ; GFX940-NEXT: s_nop 0 ; GFX940-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc -; GFX940-NEXT: flat_load_dword v5, v[4:5] +; GFX940-NEXT: flat_load_dword v3, v[4:5] ; GFX940-NEXT: s_mov_b32 s1, -1 ; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v4, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB5_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -831,12 +831,12 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_gra ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: flat_load_dword v1, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_max_f32_e32 v0, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v3, v1, v1 -; GFX90A-NEXT: v_max_f32_e32 v0, v3, v0 +; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX90A-NEXT: v_max_f32_e32 v0, v0, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -858,12 +858,12 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_gra ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX908-NEXT: flat_load_dword v1, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f32_e32 v0, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v5, v1, v1 -; GFX908-NEXT: v_max_f32_e32 v0, v5, v0 +; GFX908-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX908-NEXT: v_max_f32_e32 v0, v0, v2 ; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -935,13 +935,13 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v3, v4 +; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -982,13 +982,13 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4 +; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1008,13 +1008,13 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v5 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1084,21 +1084,21 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX940-LABEL: flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2044 +; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v4, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB7_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1132,22 +1132,22 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-LABEL: flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB7_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1157,20 +1157,20 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX908-LABEL: flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 +; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc +; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB7_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1235,13 +1235,13 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v3, v4 +; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1259,14 +1259,15 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_max_f32 v3, v2, v2 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f32_e32 v5, v4, v4 -; GFX11-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX11-NEXT: v_max_f32_e32 v3, v4, v4 +; GFX11-NEXT: v_max_f32_e32 v3, v3, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1286,14 +1287,14 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_max_f32_e32 v3, v2, v2 -; GFX10-NEXT: v_max_f32_e32 v5, v4, v4 -; GFX10-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX10-NEXT: v_max_f32_e32 v3, v4, v4 +; GFX10-NEXT: v_max_f32_e32 v3, v3, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1313,13 +1314,13 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4 +; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -1337,13 +1338,13 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v5 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1426,13 +1427,13 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amd ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v3, v4 +; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1471,13 +1472,13 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amd ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4 +; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -1495,13 +1496,13 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amd ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v5 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1572,13 +1573,13 @@ define float @flat_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v3, v4 +; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1617,13 +1618,13 @@ define float @flat_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4 +; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -1641,13 +1642,13 @@ define float @flat_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v5 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1714,13 +1715,13 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v3, v4 +; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1761,13 +1762,13 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4 +; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -1785,13 +1786,13 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v5 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1870,18 +1871,18 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX940-NEXT: s_mov_b32 s1, -1 ; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX940-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 -; GFX940-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX940-NEXT: v_max_f32_e32 v0, v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_max_f32_e32 v0, v3, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v0, v1 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB12_1 @@ -1922,17 +1923,17 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: flat_load_dword v0, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 -; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX90A-NEXT: v_max_f32_e32 v0, v0, v3 -; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_max_f32_e32 v0, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v0, v1 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB12_1 @@ -1949,17 +1950,17 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX908-NEXT: flat_load_dword v0, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 -; GFX908-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX908-NEXT: v_max_f32_e32 v0, v0, v5 -; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX908-NEXT: v_mov_b32_e32 v6, v0 +; GFX908-NEXT: v_max_f32_e32 v0, v6, v6 +; GFX908-NEXT: v_max_f32_e32 v5, v0, v1 +; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB12_1 @@ -2023,21 +2024,21 @@ define void @flat_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memor ; GFX940-LABEL: flat_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v5, v[0:1] +; GFX940-NEXT: flat_load_dword v3, v[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v4, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB13_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2069,20 +2070,20 @@ define void @flat_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memor ; GFX90A-LABEL: flat_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v5, v[0:1] +; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2092,20 +2093,20 @@ define void @flat_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memor ; GFX908-LABEL: flat_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v4, v[0:1] +; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB13_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2163,21 +2164,21 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX940-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2044 +; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v4, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB14_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2211,20 +2212,20 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX90A-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2234,20 +2235,20 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX908-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 +; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc +; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB14_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2314,23 +2315,23 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fin ; GFX940-NEXT: s_movk_i32 s0, 0xf800 ; GFX940-NEXT: s_nop 0 ; GFX940-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc -; GFX940-NEXT: flat_load_dword v5, v[4:5] +; GFX940-NEXT: flat_load_dword v3, v[4:5] ; GFX940-NEXT: s_mov_b32 s1, -1 ; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v4, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB15_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2372,12 +2373,12 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fin ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: flat_load_dword v1, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_max_f32_e32 v0, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v3, v1, v1 -; GFX90A-NEXT: v_max_f32_e32 v0, v3, v0 +; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX90A-NEXT: v_max_f32_e32 v0, v0, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -2399,12 +2400,12 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fin ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX908-NEXT: flat_load_dword v1, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f32_e32 v0, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v5, v1, v1 -; GFX908-NEXT: v_max_f32_e32 v0, v5, v0 +; GFX908-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX908-NEXT: v_max_f32_e32 v0, v0, v2 ; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -2476,13 +2477,13 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v3, v4 +; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2523,13 +2524,13 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4 +; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2549,13 +2550,13 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v5 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -2625,21 +2626,21 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX940-LABEL: flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2044 +; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v4, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB17_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2673,22 +2674,22 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX90A-LABEL: flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2698,20 +2699,20 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX908-LABEL: flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 +; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc +; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB17_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2769,29 +2770,29 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo -; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execz .LBB18_4 ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] +; GFX12-NEXT: flat_load_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[8:9], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9] +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -2800,25 +2801,24 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: ; %bb.3: ; %Flow ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX12-NEXT: .LBB18_4: ; %Flow2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX12-NEXT: s_cbranch_execz .LBB18_6 ; GFX12-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[4:5], v6, off +; GFX12-NEXT: scratch_load_b64 v[2:3], v6, off ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] ; GFX12-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX12-NEXT: .LBB18_6: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -2865,29 +2865,29 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execz .LBB18_4 ; GFX11-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] +; GFX11-NEXT: flat_load_b64 v[2:3], v[0:1] ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX11-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -2895,23 +2895,22 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11-NEXT: ; %bb.3: ; %Flow ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX11-NEXT: .LBB18_4: ; %Flow2 ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-NEXT: s_cbranch_execz .LBB18_6 ; GFX11-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo -; GFX11-NEXT: scratch_load_b64 v[4:5], v6, off +; GFX11-NEXT: scratch_load_b64 v[2:3], v6, off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] ; GFX11-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX11-NEXT: .LBB18_6: ; %atomicrmw.phi ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: @@ -3001,91 +3000,90 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX908-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 -; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX908-NEXT: s_cbranch_execz .LBB18_4 ; GFX908-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v7, v5 -; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] -; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX908-NEXT: v_mov_b32_e32 v9, v3 +; GFX908-NEXT: v_mov_b32_e32 v8, v2 +; GFX908-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB18_2 ; GFX908-NEXT: ; %bb.3: ; %Flow ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX908-NEXT: .LBB18_4: ; %Flow2 ; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_cbranch_execz .LBB18_6 ; GFX908-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc -; GFX908-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen -; GFX908-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4 -; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX908-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[0:1] +; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] ; GFX908-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen ; GFX908-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX908-NEXT: .LBB18_6: ; %atomicrmw.phi ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: v_mov_b32_e32 v0, v2 +; GFX908-NEXT: v_mov_b32_e32 v1, v3 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 ; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX8-NEXT: s_cbranch_execz .LBB18_4 ; GFX8-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v5, v[4:5] -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v3, v[2:3] +; GFX8-NEXT: flat_load_dword v2, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v7, v5 -; GFX8-NEXT: v_mov_b32_e32 v6, v4 -; GFX8-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] -; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: v_mov_b32_e32 v9, v3 +; GFX8-NEXT: v_mov_b32_e32 v8, v2 +; GFX8-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB18_2 ; GFX8-NEXT: ; %bb.3: ; %Flow ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX8-NEXT: .LBB18_4: ; %Flow2 ; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_cbranch_execz .LBB18_6 @@ -3093,18 +3091,17 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 4, v6 -; GFX8-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen -; GFX8-NEXT: buffer_load_dword v5, v7, s[0:3], 0 offen -; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX8-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v3, v7, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[0:1] +; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] ; GFX8-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen ; GFX8-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen ; GFX8-NEXT: .LBB18_6: ; %atomicrmw.phi ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: v_mov_b32_e32 v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, v3 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -3162,6 +3159,7 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7f8, v0 ; GFX12-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base @@ -3188,10 +3186,9 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[8:9], v[8:9] -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[6:7], v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[8:9], v[8:9] +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[0:1], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -3210,7 +3207,6 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_cbranch_execz .LBB19_2 ; GFX12-NEXT: .LBB19_6: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo ; GFX12-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -3266,6 +3262,7 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX11-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7f8, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo ; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base @@ -3288,10 +3285,9 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 -; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[6:7], v[8:9], v[8:9] -; GFX11-NEXT: v_max_f64 v[6:7], v[6:7], v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] +; GFX11-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3310,7 +3306,6 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX11-NEXT: s_cbranch_execz .LBB19_2 ; GFX11-NEXT: .LBB19_6: ; %atomicrmw.private ; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo ; GFX11-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -3408,6 +3403,7 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX908-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7f8, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base @@ -3430,9 +3426,8 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v9, v1 ; GFX908-NEXT: v_mov_b32_e32 v8, v0 -; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX908-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX908-NEXT: v_max_f64 v[6:7], v[0:1], v[6:7] +; GFX908-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] ; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -3448,7 +3443,6 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: s_cbranch_execz .LBB19_2 ; GFX908-NEXT: .LBB19_6: ; %atomicrmw.private ; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc ; GFX908-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen ; GFX908-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 @@ -3464,6 +3458,7 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX8-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 ; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7f8, v0 @@ -3491,9 +3486,8 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v1 ; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[6:7] +; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -3509,7 +3503,6 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX8-NEXT: s_cbranch_execz .LBB19_2 ; GFX8-NEXT: .LBB19_6: ; %atomicrmw.private ; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 4, v6 ; GFX8-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen @@ -3578,6 +3571,7 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 ; GFX12-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base @@ -3604,10 +3598,9 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[8:9], v[8:9] -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[6:7], v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[8:9], v[8:9] +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[0:1], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -3626,7 +3619,6 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_cbranch_execz .LBB20_2 ; GFX12-NEXT: .LBB20_6: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo ; GFX12-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -3683,6 +3675,7 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX11-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo ; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base @@ -3705,10 +3698,9 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 -; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[6:7], v[8:9], v[8:9] -; GFX11-NEXT: v_max_f64 v[6:7], v[6:7], v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] +; GFX11-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3727,7 +3719,6 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX11-NEXT: s_cbranch_execz .LBB20_2 ; GFX11-NEXT: .LBB20_6: ; %atomicrmw.private ; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo ; GFX11-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -3825,6 +3816,7 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX908-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base @@ -3847,9 +3839,8 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v9, v1 ; GFX908-NEXT: v_mov_b32_e32 v8, v0 -; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX908-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX908-NEXT: v_max_f64 v[6:7], v[0:1], v[6:7] +; GFX908-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] ; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -3865,7 +3856,6 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX908-NEXT: s_cbranch_execz .LBB20_2 ; GFX908-NEXT: .LBB20_6: ; %atomicrmw.private ; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc ; GFX908-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen ; GFX908-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 @@ -3881,6 +3871,7 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX8-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 ; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0 @@ -3908,9 +3899,8 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v1 ; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[6:7] +; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -3926,7 +3916,6 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX8-NEXT: s_cbranch_execz .LBB20_2 ; GFX8-NEXT: .LBB20_6: ; %atomicrmw.private ; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 4, v6 ; GFX8-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen @@ -3995,6 +3984,7 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo ; GFX12-NEXT: s_wait_alu 0xfffe @@ -4011,21 +4001,20 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB21_3: ; %atomicrmw.global -; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] +; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB21_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[8:9], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -4034,19 +4023,18 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: ; %bb.5: ; %Flow ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX12-NEXT: s_cbranch_execz .LBB21_2 ; GFX12-NEXT: .LBB21_6: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] -; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX12-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3] -; GFX12-NEXT: scratch_store_b64 v4, v[0:1], off +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[6:7] +; GFX12-NEXT: scratch_store_b64 v2, v[0:1], off ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -4092,6 +4080,7 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX11-NEXT: s_mov_b32 s0, exec_lo ; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 @@ -4104,22 +4093,21 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB21_3: ; %atomicrmw.global -; GFX11-NEXT: flat_load_b64 v[6:7], v[0:1] +; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: .LBB21_4: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -4127,19 +4115,18 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11-NEXT: ; %bb.5: ; %Flow ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-NEXT: s_cbranch_execz .LBB21_2 ; GFX11-NEXT: .LBB21_6: ; %atomicrmw.private ; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX11-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo -; GFX11-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX11-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo +; GFX11-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] -; GFX11-NEXT: scratch_store_b64 v4, v[0:1], off +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[6:7] +; GFX11-NEXT: scratch_store_b64 v2, v[0:1], off ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -4225,6 +4212,7 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX908-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -4237,40 +4225,38 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; GFX908-NEXT: .LBB21_3: ; %atomicrmw.global -; GFX908-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: .LBB21_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX908-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX908-NEXT: v_mov_b32_e32 v7, v5 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v5, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_mov_b32_e32 v4, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB21_4 ; GFX908-NEXT: ; %bb.5: ; %Flow ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX908-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_cbranch_execz .LBB21_2 ; GFX908-NEXT: .LBB21_6: ; %atomicrmw.private ; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX908-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc -; GFX908-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen -; GFX908-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX908-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc +; GFX908-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] -; GFX908-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen -; GFX908-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[6:7] +; GFX908-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] @@ -4278,6 +4264,7 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX8-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 ; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -4292,44 +4279,42 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; GFX8-NEXT: .LBB21_3: ; %atomicrmw.global -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v7, v[4:5] -; GFX8-NEXT: flat_load_dword v6, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v5, v[2:3] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB21_4: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX8-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v5, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB21_4 ; GFX8-NEXT: ; %bb.5: ; %Flow ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_cbranch_execz .LBB21_2 ; GFX8-NEXT: .LBB21_6: ; %atomicrmw.private ; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX8-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v4 -; GFX8-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen -; GFX8-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GFX8-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v2 +; GFX8-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] -; GFX8-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[6:7] +; GFX8-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -4385,13 +4370,14 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, 0x7f8, v0 -; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX12-NEXT: v_add_co_u32 v6, vcc_lo, 0x7f8, v0 +; GFX12-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v7 ; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB22_3 ; GFX12-NEXT: ; %bb.1: ; %Flow2 @@ -4404,21 +4390,20 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB22_3: ; %atomicrmw.global -; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] +; GFX12-NEXT: flat_load_b64 v[2:3], v[6:7] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB22_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[8:9], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[6:7], v[0:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -4426,20 +4411,19 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_cbranch_execnz .LBB22_4 ; GFX12-NEXT: ; %bb.5: ; %Flow ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX12-NEXT: s_cbranch_execz .LBB22_2 ; GFX12-NEXT: .LBB22_6: ; %atomicrmw.private -; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] -; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX12-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3] -; GFX12-NEXT: scratch_store_b64 v4, v[0:1], off +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: scratch_store_b64 v2, v[0:1], off ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -4487,12 +4471,13 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX11-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x7f8, v0 -; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, 0x7f8, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo ; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX11-NEXT: s_mov_b32 s0, exec_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v7 ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB22_3 ; GFX11-NEXT: ; %bb.1: ; %Flow2 @@ -4502,42 +4487,40 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB22_3: ; %atomicrmw.global -; GFX11-NEXT: flat_load_b64 v[6:7], v[0:1] +; GFX11-NEXT: flat_load_b64 v[2:3], v[6:7] ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: .LBB22_4: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[6:7], v[0:3] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_cbranch_execnz .LBB22_4 ; GFX11-NEXT: ; %bb.5: ; %Flow ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-NEXT: s_cbranch_execz .LBB22_2 ; GFX11-NEXT: .LBB22_6: ; %atomicrmw.private -; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX11-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo -; GFX11-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX11-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo +; GFX11-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] -; GFX11-NEXT: scratch_store_b64 v4, v[0:1], off +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX11-NEXT: scratch_store_b64 v2, v[0:1], off ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -4627,10 +4610,11 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX908-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0x7f8, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX908-NEXT: v_add_co_u32_e32 v6, vcc, 0x7f8, v0 +; GFX908-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB22_3 @@ -4641,40 +4625,38 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; GFX908-NEXT: .LBB22_3: ; %atomicrmw.global -; GFX908-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: .LBB22_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX908-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX908-NEXT: v_mov_b32_e32 v7, v5 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX908-NEXT: v_mov_b32_e32 v3, v1 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_mov_b32_e32 v2, v0 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB22_4 ; GFX908-NEXT: ; %bb.5: ; %Flow ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX908-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_cbranch_execz .LBB22_2 ; GFX908-NEXT: .LBB22_6: ; %atomicrmw.private -; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX908-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc -; GFX908-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen -; GFX908-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; GFX908-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc +; GFX908-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] -; GFX908-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen -; GFX908-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX908-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] @@ -4682,12 +4664,13 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX8-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 ; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7f8, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7f8, v0 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v7 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB22_3 @@ -4698,44 +4681,42 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; GFX8-NEXT: .LBB22_3: ; %atomicrmw.global -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v7, v[4:5] -; GFX8-NEXT: flat_load_dword v6, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v6 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v2, v[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB22_4: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX8-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB22_4 ; GFX8-NEXT: ; %bb.5: ; %Flow ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_cbranch_execz .LBB22_2 ; GFX8-NEXT: .LBB22_6: ; %atomicrmw.private -; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX8-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v4 -; GFX8-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen -; GFX8-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; GFX8-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v2 +; GFX8-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] -; GFX8-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX8-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -4794,13 +4775,14 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 -; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX12-NEXT: v_add_co_u32 v6, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v7 ; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB23_3 ; GFX12-NEXT: ; %bb.1: ; %Flow2 @@ -4813,21 +4795,20 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB23_3: ; %atomicrmw.global -; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] +; GFX12-NEXT: flat_load_b64 v[2:3], v[6:7] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB23_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[8:9], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[6:7], v[0:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -4835,20 +4816,19 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_cbranch_execnz .LBB23_4 ; GFX12-NEXT: ; %bb.5: ; %Flow ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX12-NEXT: s_cbranch_execz .LBB23_2 ; GFX12-NEXT: .LBB23_6: ; %atomicrmw.private -; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] -; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX12-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3] -; GFX12-NEXT: scratch_store_b64 v4, v[0:1], off +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: scratch_store_b64 v2, v[0:1], off ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -4897,12 +4877,13 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX11-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 -; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, 0xfffff800, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, -1, v1, vcc_lo ; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX11-NEXT: s_mov_b32 s0, exec_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v7 ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB23_3 ; GFX11-NEXT: ; %bb.1: ; %Flow2 @@ -4912,42 +4893,40 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB23_3: ; %atomicrmw.global -; GFX11-NEXT: flat_load_b64 v[6:7], v[0:1] +; GFX11-NEXT: flat_load_b64 v[2:3], v[6:7] ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: .LBB23_4: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[6:7], v[0:3] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_cbranch_execnz .LBB23_4 ; GFX11-NEXT: ; %bb.5: ; %Flow ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-NEXT: s_cbranch_execz .LBB23_2 ; GFX11-NEXT: .LBB23_6: ; %atomicrmw.private -; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX11-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo -; GFX11-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX11-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo +; GFX11-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] -; GFX11-NEXT: scratch_store_b64 v4, v[0:1], off +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX11-NEXT: scratch_store_b64 v2, v[0:1], off ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -5037,10 +5016,11 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX908-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX908-NEXT: v_add_co_u32_e32 v6, vcc, 0xfffff800, v0 +; GFX908-NEXT: v_addc_co_u32_e32 v7, vcc, -1, v1, vcc ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB23_3 @@ -5051,40 +5031,38 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; GFX908-NEXT: .LBB23_3: ; %atomicrmw.global -; GFX908-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: .LBB23_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX908-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX908-NEXT: v_mov_b32_e32 v7, v5 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX908-NEXT: v_mov_b32_e32 v3, v1 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_mov_b32_e32 v2, v0 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB23_4 ; GFX908-NEXT: ; %bb.5: ; %Flow ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX908-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_cbranch_execz .LBB23_2 ; GFX908-NEXT: .LBB23_6: ; %atomicrmw.private -; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX908-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc -; GFX908-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen -; GFX908-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; GFX908-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc +; GFX908-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] -; GFX908-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen -; GFX908-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX908-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] @@ -5092,12 +5070,13 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX8-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 ; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, -1, v1, vcc ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v7 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB23_3 @@ -5108,44 +5087,42 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; GFX8-NEXT: .LBB23_3: ; %atomicrmw.global -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v7, v[4:5] -; GFX8-NEXT: flat_load_dword v6, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v6 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v2, v[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB23_4: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX8-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB23_4 ; GFX8-NEXT: ; %bb.5: ; %Flow ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_cbranch_execz .LBB23_2 ; GFX8-NEXT: .LBB23_6: ; %atomicrmw.private -; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX8-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v4 -; GFX8-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen -; GFX8-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; GFX8-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v2 +; GFX8-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] -; GFX8-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX8-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -5204,29 +5181,29 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo -; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execz .LBB24_4 ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] +; GFX12-NEXT: flat_load_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[8:9], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9] +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -5235,25 +5212,24 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX12-NEXT: ; %bb.3: ; %Flow ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX12-NEXT: .LBB24_4: ; %Flow2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX12-NEXT: s_cbranch_execz .LBB24_6 ; GFX12-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[4:5], v6, off +; GFX12-NEXT: scratch_load_b64 v[2:3], v6, off ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] ; GFX12-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX12-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -5300,29 +5276,29 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX11-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execz .LBB24_4 ; GFX11-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] +; GFX11-NEXT: flat_load_b64 v[2:3], v[0:1] ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX11-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -5330,78 +5306,76 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX11-NEXT: ; %bb.3: ; %Flow ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX11-NEXT: .LBB24_4: ; %Flow2 ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-NEXT: s_cbranch_execz .LBB24_6 ; GFX11-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo -; GFX11-NEXT: scratch_load_b64 v[4:5], v6, off +; GFX11-NEXT: scratch_load_b64 v[2:3], v6, off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] ; GFX11-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX11-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX10-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v1 ; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execz .LBB24_4 ; GFX10-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX10-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX10-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v7, v5 -; GFX10-NEXT: v_mov_b32_e32 v6, v4 -; GFX10-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX10-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX10-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] +; GFX10-NEXT: v_mov_b32_e32 v9, v3 +; GFX10-NEXT: v_mov_b32_e32 v8, v2 +; GFX10-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX10-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX10-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB24_2 ; GFX10-NEXT: ; %bb.3: ; %Flow ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX10-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX10-NEXT: .LBB24_4: ; %Flow2 ; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 ; GFX10-NEXT: s_cbranch_execz .LBB24_6 ; GFX10-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX10-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo -; GFX10-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen -; GFX10-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4 +; GFX10-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen +; GFX10-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX10-NEXT: v_max_f64 v[0:1], v[2:3], v[0:1] +; GFX10-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] ; GFX10-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen ; GFX10-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX10-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, v4 -; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: v_mov_b32_e32 v0, v2 +; GFX10-NEXT: v_mov_b32_e32 v1, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: @@ -5409,140 +5383,138 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 -; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB24_4 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] -; GFX90A-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX90A-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9] -; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX90A-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5] +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB24_2 ; GFX90A-NEXT: ; %bb.3: ; %Flow ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: .LBB24_4: ; %Flow2 ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB24_6 ; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc -; GFX90A-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX90A-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX90A-NEXT: v_max_f64 v[0:1], v[2:3], v[0:1] +; GFX90A-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX90A-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] ; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 -; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX908-NEXT: s_cbranch_execz .LBB24_4 ; GFX908-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v7, v5 -; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] -; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX908-NEXT: v_mov_b32_e32 v9, v3 +; GFX908-NEXT: v_mov_b32_e32 v8, v2 +; GFX908-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB24_2 ; GFX908-NEXT: ; %bb.3: ; %Flow ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX908-NEXT: .LBB24_4: ; %Flow2 ; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_cbranch_execz .LBB24_6 ; GFX908-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc -; GFX908-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen -; GFX908-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4 -; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX908-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[0:1] +; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] ; GFX908-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen ; GFX908-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX908-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: v_mov_b32_e32 v0, v2 +; GFX908-NEXT: v_mov_b32_e32 v1, v3 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 ; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX8-NEXT: s_cbranch_execz .LBB24_4 ; GFX8-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v5, v[4:5] -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v3, v[2:3] +; GFX8-NEXT: flat_load_dword v2, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v7, v5 -; GFX8-NEXT: v_mov_b32_e32 v6, v4 -; GFX8-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] -; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: v_mov_b32_e32 v9, v3 +; GFX8-NEXT: v_mov_b32_e32 v8, v2 +; GFX8-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB24_2 ; GFX8-NEXT: ; %bb.3: ; %Flow ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX8-NEXT: .LBB24_4: ; %Flow2 ; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_cbranch_execz .LBB24_6 @@ -5550,18 +5522,17 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 4, v6 -; GFX8-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen -; GFX8-NEXT: buffer_load_dword v5, v7, s[0:3], 0 offen -; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX8-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v3, v7, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[0:1] +; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] ; GFX8-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen ; GFX8-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen ; GFX8-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: v_mov_b32_e32 v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, v3 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -5570,37 +5541,37 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b64 s[4:5], 0xc0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX7-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX7-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX7-NEXT: s_cbranch_execz .LBB24_4 ; GFX7-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0 -; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v5, v[4:5] -; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; GFX7-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v3, v[2:3] +; GFX7-NEXT: flat_load_dword v2, v[0:1] ; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] -; GFX7-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX7-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9] -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX7-NEXT: v_mov_b32_e32 v9, v3 +; GFX7-NEXT: v_mov_b32_e32 v8, v2 +; GFX7-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX7-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5] +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB24_2 ; GFX7-NEXT: ; %bb.3: ; %Flow ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX7-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX7-NEXT: .LBB24_4: ; %Flow2 ; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: s_cbranch_execz .LBB24_6 @@ -5608,18 +5579,17 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX7-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc ; GFX7-NEXT: v_add_i32_e32 v7, vcc, 4, v6 -; GFX7-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen -; GFX7-NEXT: buffer_load_dword v5, v7, s[0:3], 0 offen -; GFX7-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX7-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen +; GFX7-NEXT: buffer_load_dword v3, v7, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX7-NEXT: v_max_f64 v[0:1], v[2:3], v[0:1] +; GFX7-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX7-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] ; GFX7-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen ; GFX7-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen ; GFX7-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v0, v4 -; GFX7-NEXT: v_mov_b32_e32 v1, v5 +; GFX7-NEXT: v_mov_b32_e32 v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmax ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 @@ -5634,29 +5604,29 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo -; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execz .LBB25_4 ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] +; GFX12-NEXT: flat_load_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB25_2: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[8:9], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9] +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -5665,25 +5635,24 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX12-NEXT: ; %bb.3: ; %Flow ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX12-NEXT: .LBB25_4: ; %Flow2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX12-NEXT: s_cbranch_execz .LBB25_6 ; GFX12-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[4:5], v6, off +; GFX12-NEXT: scratch_load_b64 v[2:3], v6, off ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] ; GFX12-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX12-NEXT: .LBB25_6: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -5730,29 +5699,29 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX11-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execz .LBB25_4 ; GFX11-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] +; GFX11-NEXT: flat_load_b64 v[2:3], v[0:1] ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: .LBB25_2: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX11-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -5760,23 +5729,22 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX11-NEXT: ; %bb.3: ; %Flow ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX11-NEXT: .LBB25_4: ; %Flow2 ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-NEXT: s_cbranch_execz .LBB25_6 ; GFX11-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo -; GFX11-NEXT: scratch_load_b64 v[4:5], v6, off +; GFX11-NEXT: scratch_load_b64 v[2:3], v6, off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] ; GFX11-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX11-NEXT: .LBB25_6: ; %atomicrmw.phi ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -5866,91 +5834,90 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX908-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 -; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX908-NEXT: s_cbranch_execz .LBB25_4 ; GFX908-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: .LBB25_2: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v7, v5 -; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] -; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX908-NEXT: v_mov_b32_e32 v9, v3 +; GFX908-NEXT: v_mov_b32_e32 v8, v2 +; GFX908-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB25_2 ; GFX908-NEXT: ; %bb.3: ; %Flow ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX908-NEXT: .LBB25_4: ; %Flow2 ; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_cbranch_execz .LBB25_6 ; GFX908-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc -; GFX908-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen -; GFX908-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4 -; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX908-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[0:1] +; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] ; GFX908-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen ; GFX908-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX908-NEXT: .LBB25_6: ; %atomicrmw.phi ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: v_mov_b32_e32 v0, v2 +; GFX908-NEXT: v_mov_b32_e32 v1, v3 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 ; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX8-NEXT: s_cbranch_execz .LBB25_4 ; GFX8-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v5, v[4:5] -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v3, v[2:3] +; GFX8-NEXT: flat_load_dword v2, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB25_2: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v7, v5 -; GFX8-NEXT: v_mov_b32_e32 v6, v4 -; GFX8-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] -; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: v_mov_b32_e32 v9, v3 +; GFX8-NEXT: v_mov_b32_e32 v8, v2 +; GFX8-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB25_2 ; GFX8-NEXT: ; %bb.3: ; %Flow ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX8-NEXT: .LBB25_4: ; %Flow2 ; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_cbranch_execz .LBB25_6 @@ -5958,18 +5925,17 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 4, v6 -; GFX8-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen -; GFX8-NEXT: buffer_load_dword v5, v7, s[0:3], 0 offen -; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX8-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v3, v7, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[0:1] +; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] ; GFX8-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen ; GFX8-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen ; GFX8-NEXT: .LBB25_6: ; %atomicrmw.phi ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: v_mov_b32_e32 v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, v3 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -6032,8 +5998,9 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: flat_load_b32 v5, v[0:1] @@ -6045,12 +6012,11 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v7 +; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v2 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -6083,14 +6049,14 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX940-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 -; GFX940-NEXT: v_lshrrev_b32_e32 v6, v3, v7 -; GFX940-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX940-NEXT: v_max_f16_e32 v6, v6, v6 -; GFX940-NEXT: v_max_f16_e32 v5, v6, v5 +; GFX940-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX940-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX940-NEXT: v_max_f16_e32 v5, v5, v2 ; GFX940-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX940-NEXT: buffer_wbl2 sc1 @@ -6110,8 +6076,9 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX11-NEXT: flat_load_b32 v5, v[0:1] @@ -6123,12 +6090,11 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v5, v5, v7 +; GFX11-NEXT: v_max_f16_e32 v5, v5, v2 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -6152,6 +6118,7 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 @@ -6163,10 +6130,9 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 -; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX10-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX10-NEXT: v_max_f16_e32 v5, v5, v7 +; GFX10-NEXT: v_max_f16_e32 v5, v5, v2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6195,14 +6161,14 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v3, v7 -; GFX90A-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX90A-NEXT: v_max_f16_e32 v6, v6, v6 -; GFX90A-NEXT: v_max_f16_e32 v5, v6, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX90A-NEXT: v_max_f16_e32 v5, v5, v2 ; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc @@ -6229,14 +6195,14 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 -; GFX908-NEXT: v_lshrrev_b32_e32 v7, v3, v6 -; GFX908-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX908-NEXT: v_max_f16_e32 v7, v7, v7 -; GFX908-NEXT: v_max_f16_e32 v5, v7, v5 +; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX908-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX908-NEXT: v_max_f16_e32 v5, v5, v2 ; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX908-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc @@ -6263,17 +6229,17 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, v3, v6 -; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v7, v7, v7 -; GFX8-NEXT: v_max_f16_e32 v5, v7, v5 -; GFX8-NEXT: v_and_b32_e32 v8, v6, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX8-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX8-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX8-NEXT: v_or_b32_e32 v5, v8, v5 +; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 ; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -6336,10 +6302,11 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -6349,12 +6316,11 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v7 +; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v2 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -6389,14 +6355,14 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX940-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 -; GFX940-NEXT: v_lshrrev_b32_e32 v6, v3, v7 -; GFX940-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX940-NEXT: v_max_f16_e32 v6, v6, v6 -; GFX940-NEXT: v_max_f16_e32 v5, v6, v5 +; GFX940-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX940-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX940-NEXT: v_max_f16_e32 v5, v5, v2 ; GFX940-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX940-NEXT: buffer_wbl2 sc1 @@ -6417,10 +6383,11 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: flat_load_b32 v5, v[0:1] ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -6430,12 +6397,11 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v5, v5, v7 +; GFX11-NEXT: v_max_f16_e32 v5, v5, v2 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -6460,9 +6426,10 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v5, v[0:1] ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -6471,10 +6438,9 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 -; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX10-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX10-NEXT: v_max_f16_e32 v5, v5, v7 +; GFX10-NEXT: v_max_f16_e32 v5, v5, v2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6504,14 +6470,14 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v3, v7 -; GFX90A-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX90A-NEXT: v_max_f16_e32 v6, v6, v6 -; GFX90A-NEXT: v_max_f16_e32 v5, v6, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX90A-NEXT: v_max_f16_e32 v5, v5, v2 ; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc @@ -6539,14 +6505,14 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 -; GFX908-NEXT: v_lshrrev_b32_e32 v7, v3, v6 -; GFX908-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX908-NEXT: v_max_f16_e32 v7, v7, v7 -; GFX908-NEXT: v_max_f16_e32 v5, v7, v5 +; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX908-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX908-NEXT: v_max_f16_e32 v5, v5, v2 ; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX908-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc @@ -6574,17 +6540,17 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, v3, v6 -; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v7, v7, v7 -; GFX8-NEXT: v_max_f16_e32 v5, v7, v5 -; GFX8-NEXT: v_and_b32_e32 v8, v6, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX8-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX8-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX8-NEXT: v_or_b32_e32 v5, v8, v5 +; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 ; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -6649,10 +6615,11 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -6662,12 +6629,11 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v7 +; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v2 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -6703,14 +6669,14 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX940-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 -; GFX940-NEXT: v_lshrrev_b32_e32 v6, v3, v7 -; GFX940-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX940-NEXT: v_max_f16_e32 v6, v6, v6 -; GFX940-NEXT: v_max_f16_e32 v5, v6, v5 +; GFX940-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX940-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX940-NEXT: v_max_f16_e32 v5, v5, v2 ; GFX940-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX940-NEXT: buffer_wbl2 sc1 @@ -6731,10 +6697,11 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: flat_load_b32 v5, v[0:1] ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -6744,12 +6711,11 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v5, v5, v7 +; GFX11-NEXT: v_max_f16_e32 v5, v5, v2 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -6774,9 +6740,10 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v5, v[0:1] ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -6785,10 +6752,9 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 -; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX10-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX10-NEXT: v_max_f16_e32 v5, v5, v7 +; GFX10-NEXT: v_max_f16_e32 v5, v5, v2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6818,14 +6784,14 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v3, v7 -; GFX90A-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX90A-NEXT: v_max_f16_e32 v6, v6, v6 -; GFX90A-NEXT: v_max_f16_e32 v5, v6, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX90A-NEXT: v_max_f16_e32 v5, v5, v2 ; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc @@ -6853,14 +6819,14 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 -; GFX908-NEXT: v_lshrrev_b32_e32 v7, v3, v6 -; GFX908-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX908-NEXT: v_max_f16_e32 v7, v7, v7 -; GFX908-NEXT: v_max_f16_e32 v5, v7, v5 +; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX908-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX908-NEXT: v_max_f16_e32 v5, v5, v2 ; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX908-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc @@ -6888,17 +6854,17 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, v3, v6 -; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v7, v7, v7 -; GFX8-NEXT: v_max_f16_e32 v5, v7, v5 -; GFX8-NEXT: v_and_b32_e32 v8, v6, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX8-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX8-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX8-NEXT: v_or_b32_e32 v5, v8, v5 +; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 ; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -6962,8 +6928,9 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: flat_load_b32 v4, v[0:1] @@ -6975,10 +6942,9 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3 -; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v7 +; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 @@ -7012,13 +6978,13 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX940-NEXT: v_not_b32_e32 v6, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX940-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX940-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX940-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX940-NEXT: v_max_f16_e32 v4, v4, v7 +; GFX940-NEXT: v_max_f16_e32 v4, v4, v2 ; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX940-NEXT: buffer_wbl2 sc1 @@ -7038,8 +7004,9 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX11-NEXT: flat_load_b32 v4, v[0:1] @@ -7051,10 +7018,9 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX11-NEXT: v_max_f16_e32 v3, v3, v7 +; GFX11-NEXT: v_max_f16_e32 v3, v3, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 @@ -7079,6 +7045,7 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 @@ -7090,9 +7057,8 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX10-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX10-NEXT: v_max_f16_e32 v3, v3, v7 +; GFX10-NEXT: v_max_f16_e32 v3, v3, v2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -7121,13 +7087,13 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX90A-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX90A-NEXT: v_max_f16_e32 v4, v4, v7 +; GFX90A-NEXT: v_max_f16_e32 v4, v4, v2 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc @@ -7154,13 +7120,13 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 ; GFX908-NEXT: v_not_b32_e32 v6, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX908-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX908-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX908-NEXT: v_max_f16_e32 v3, v3, v7 +; GFX908-NEXT: v_max_f16_e32 v3, v3, v2 ; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc @@ -7187,16 +7153,16 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 ; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX8-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX8-NEXT: v_max_f16_e32 v3, v3, v7 -; GFX8-NEXT: v_and_b32_e32 v8, v4, v6 +; GFX8-NEXT: v_max_f16_e32 v3, v3, v2 +; GFX8-NEXT: v_and_b32_e32 v7, v4, v6 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX8-NEXT: v_or_b32_e32 v3, v8, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v7, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -7255,36 +7221,36 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: flat_load_b32 v4, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v6, v3 +; GFX12-NEXT: v_not_b32_e32 v5, v5 ; GFX12-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3 -; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v7 +; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -7302,29 +7268,29 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: flat_load_dword v5, v[0:1] -; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX940-NEXT: flat_load_dword v3, v[0:1] +; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v6, v4 +; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0 +; GFX940-NEXT: v_not_b32_e32 v5, v5 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX940-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX940-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX940-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX940-NEXT: v_max_f16_e32 v4, v4, v7 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX940-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX940-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB30_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7334,37 +7300,37 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX11-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: flat_load_b32 v4, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v6, v3 +; GFX11-NEXT: v_not_b32_e32 v5, v5 ; GFX11-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX11-NEXT: v_max_f16_e32 v3, v3, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -7376,31 +7342,31 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX10-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX10-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX10-NEXT: flat_load_dword v4, v[0:1] -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX10-NEXT: v_not_b32_e32 v6, v3 +; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX10-NEXT: v_not_b32_e32 v5, v5 ; GFX10-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX10-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX10-NEXT: v_max_f16_e32 v3, v3, v7 -; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX10-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB30_1 @@ -7411,31 +7377,31 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX90A-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: flat_load_dword v5, v[0:1] -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v6, v4 +; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX90A-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX90A-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX90A-NEXT: v_max_f16_e32 v4, v4, v7 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc +; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB30_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7445,31 +7411,31 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX908-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 +; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: flat_load_dword v4, v[0:1] -; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX908-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX908-NEXT: flat_load_dword v3, v[0:1] +; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 -; GFX908-NEXT: v_not_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX908-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX908-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX908-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX908-NEXT: v_max_f16_e32 v3, v3, v7 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX908-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB30_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7479,32 +7445,32 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX8-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX8-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 -; GFX8-NEXT: v_not_b32_e32 v6, v3 +; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX8-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX8-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX8-NEXT: v_max_f16_e32 v3, v3, v7 -; GFX8-NEXT: v_and_b32_e32 v8, v4, v6 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX8-NEXT: v_or_b32_e32 v3, v8, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX8-NEXT: v_and_b32_e32 v7, v3, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB30_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7559,36 +7525,36 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: flat_load_b32 v4, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v6, v3 +; GFX12-NEXT: v_not_b32_e32 v5, v5 ; GFX12-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3 -; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v7 +; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -7607,29 +7573,29 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: flat_load_dword v5, v[0:1] -; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX940-NEXT: flat_load_dword v3, v[0:1] +; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v6, v4 +; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0 +; GFX940-NEXT: v_not_b32_e32 v5, v5 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX940-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX940-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX940-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX940-NEXT: v_max_f16_e32 v4, v4, v7 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX940-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX940-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB31_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7639,37 +7605,37 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX11-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX11-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: flat_load_b32 v4, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v6, v3 +; GFX11-NEXT: v_not_b32_e32 v5, v5 ; GFX11-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX11-NEXT: v_max_f16_e32 v3, v3, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -7681,31 +7647,31 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX10-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX10-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX10-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX10-NEXT: flat_load_dword v4, v[0:1] -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX10-NEXT: v_not_b32_e32 v6, v3 +; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX10-NEXT: v_not_b32_e32 v5, v5 ; GFX10-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX10-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX10-NEXT: v_max_f16_e32 v3, v3, v7 -; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX10-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB31_1 @@ -7716,31 +7682,31 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX90A-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: flat_load_dword v5, v[0:1] -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v6, v4 +; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX90A-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX90A-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX90A-NEXT: v_max_f16_e32 v4, v4, v7 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc +; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB31_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7750,31 +7716,31 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX908-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 +; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: flat_load_dword v4, v[0:1] -; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX908-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX908-NEXT: flat_load_dword v3, v[0:1] +; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 -; GFX908-NEXT: v_not_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX908-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX908-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX908-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX908-NEXT: v_max_f16_e32 v3, v3, v7 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX908-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB31_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7784,32 +7750,32 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX8-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX8-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 -; GFX8-NEXT: v_not_b32_e32 v6, v3 +; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX8-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX8-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX8-NEXT: v_max_f16_e32 v3, v3, v7 -; GFX8-NEXT: v_and_b32_e32 v8, v4, v6 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX8-NEXT: v_or_b32_e32 v3, v8, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX8-NEXT: v_and_b32_e32 v7, v3, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB31_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7865,15 +7831,15 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2046 +; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: v_max_num_f16_e32 v3, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v5, v4, v4 -; GFX12-NEXT: v_max_num_f16_e32 v3, v5, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v3, v4, v4 +; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 @@ -7898,14 +7864,14 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2046 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX940-NEXT: s_mov_b32 s2, 0xffff0000 ; GFX940-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f16_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f16_e32 v3, v3, v4 +; GFX940-NEXT: v_max_f16_e32 v3, v3, v2 ; GFX940-NEXT: v_and_or_b32 v4, v5, s2, v3 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 sc0 @@ -7924,15 +7890,15 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2046 +; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: v_max_f16_e32 v3, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v5, v4, v4 -; GFX11-NEXT: v_max_f16_e32 v3, v5, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v3, v4, v4 +; GFX11-NEXT: v_max_f16_e32 v3, v3, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 @@ -7956,23 +7922,23 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_max_f16_e32 v1, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] ; GFX10-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_max_f16_e32 v0, v2, v2 -; GFX10-NEXT: v_max_f16_e32 v5, v1, v1 -; GFX10-NEXT: v_max_f16_e32 v0, v5, v0 +; GFX10-NEXT: v_mov_b32_e32 v6, v0 +; GFX10-NEXT: v_max_f16_e32 v0, v6, v6 +; GFX10-NEXT: v_max_f16_e32 v0, v0, v1 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0 +; GFX10-NEXT: v_and_or_b32 v5, 0xffff0000, v6, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB32_1 @@ -7985,14 +7951,14 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2046 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 ; GFX90A-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f16_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f16_e32 v3, v3, v4 +; GFX90A-NEXT: v_max_f16_e32 v3, v3, v2 ; GFX90A-NEXT: v_and_or_b32 v4, v5, s6, v3 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8011,14 +7977,14 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2046 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 ; GFX908-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f16_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f16_e32 v3, v3, v5 +; GFX908-NEXT: v_max_f16_e32 v3, v3, v2 ; GFX908-NEXT: v_and_or_b32 v3, v4, s6, v3 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2046 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8039,19 +8005,19 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_e32 v1, v2, v2 ; GFX8-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v0, v1, v1 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 -; GFX8-NEXT: v_max_f16_e32 v0, v0, v5 -; GFX8-NEXT: v_or_b32_e32 v0, v6, v0 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_max_f16_e32 v0, v6, v6 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX8-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX8-NEXT: v_or_b32_e32 v5, v2, v0 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB32_1 @@ -8101,24 +8067,24 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2046 +; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2046 +; GFX12-NEXT: v_max_num_f16_e32 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_max_num_f16_e32 v3, v2, v2 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_max_num_f16_e32 v5, v4, v4 +; GFX12-NEXT: v_max_num_f16_e32 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v3, v5, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v4 +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -8132,23 +8098,23 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX940-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2046 +; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2046 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX940-NEXT: s_mov_b32 s2, 0xffff0000 ; GFX940-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_max_f16_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f16_e32 v4, v5, v5 -; GFX940-NEXT: v_max_f16_e32 v3, v4, v3 -; GFX940-NEXT: v_and_or_b32 v4, v5, s2, v3 +; GFX940-NEXT: v_max_f16_e32 v2, v3, v3 +; GFX940-NEXT: v_max_f16_e32 v2, v2, v4 +; GFX940-NEXT: v_and_or_b32 v2, v3, s2, v2 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 sc0 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB33_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8158,25 +8124,25 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX11-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2046 +; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2046 +; GFX11-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_max_f16_e32 v3, v2, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f16_e32 v5, v4, v4 +; GFX11-NEXT: v_max_f16_e32 v2, v3, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v3, v5, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: v_max_f16_e32 v2, v2, v4 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX11-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -8190,23 +8156,23 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: flat_load_dword v4, v[0:1] +; GFX10-NEXT: flat_load_dword v3, v[0:1] ; GFX10-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_max_f16_e32 v3, v2, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_max_f16_e32 v5, v4, v4 -; GFX10-NEXT: v_max_f16_e32 v3, v5, v3 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX10-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX10-NEXT: v_max_f16_e32 v2, v3, v3 +; GFX10-NEXT: v_max_f16_e32 v2, v2, v4 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB33_1 @@ -8217,22 +8183,22 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX90A-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2046 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2046 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 ; GFX90A-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_max_f16_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f16_e32 v4, v5, v5 -; GFX90A-NEXT: v_max_f16_e32 v3, v4, v3 -; GFX90A-NEXT: v_and_or_b32 v4, v5, s6, v3 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 glc +; GFX90A-NEXT: v_max_f16_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v4 +; GFX90A-NEXT: v_and_or_b32 v2, v3, s6, v2 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB33_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8242,22 +8208,22 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX908-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2046 +; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2046 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 ; GFX908-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f16_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f16_e32 v5, v4, v4 -; GFX908-NEXT: v_max_f16_e32 v3, v5, v3 -; GFX908-NEXT: v_and_or_b32 v3, v4, s6, v3 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2046 glc +; GFX908-NEXT: v_max_f16_e32 v2, v3, v3 +; GFX908-NEXT: v_max_f16_e32 v2, v2, v4 +; GFX908-NEXT: v_and_or_b32 v2, v3, s6, v2 +; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB33_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8269,22 +8235,22 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fe, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX8-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_max_f16_e32 v3, v2, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f16_e32 v5, v4, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX8-NEXT: v_max_f16_e32 v3, v5, v3 -; GFX8-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_max_f16_e32 v2, v3, v3 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v4 +; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB33_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8334,10 +8300,11 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -8347,12 +8314,11 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v7 +; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v2 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -8388,14 +8354,14 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX940-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 -; GFX940-NEXT: v_lshrrev_b32_e32 v6, v3, v7 -; GFX940-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX940-NEXT: v_max_f16_e32 v6, v6, v6 -; GFX940-NEXT: v_max_f16_e32 v5, v6, v5 +; GFX940-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX940-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX940-NEXT: v_max_f16_e32 v5, v5, v2 ; GFX940-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 @@ -8416,10 +8382,11 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: flat_load_b32 v5, v[0:1] ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -8429,12 +8396,11 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v5, v5, v7 +; GFX11-NEXT: v_max_f16_e32 v5, v5, v2 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -8459,9 +8425,10 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v5, v[0:1] ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -8470,10 +8437,9 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 -; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX10-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX10-NEXT: v_max_f16_e32 v5, v5, v7 +; GFX10-NEXT: v_max_f16_e32 v5, v5, v2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -8503,14 +8469,14 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v3, v7 -; GFX90A-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX90A-NEXT: v_max_f16_e32 v6, v6, v6 -; GFX90A-NEXT: v_max_f16_e32 v5, v6, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX90A-NEXT: v_max_f16_e32 v5, v5, v2 ; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX90A-NEXT: buffer_wbl2 @@ -8540,14 +8506,14 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 -; GFX908-NEXT: v_lshrrev_b32_e32 v7, v3, v6 -; GFX908-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX908-NEXT: v_max_f16_e32 v7, v7, v7 -; GFX908-NEXT: v_max_f16_e32 v5, v7, v5 +; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX908-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX908-NEXT: v_max_f16_e32 v5, v5, v2 ; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX908-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc @@ -8575,17 +8541,17 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, v3, v6 -; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v7, v7, v7 -; GFX8-NEXT: v_max_f16_e32 v5, v7, v5 -; GFX8-NEXT: v_and_b32_e32 v8, v6, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX8-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX8-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX8-NEXT: v_or_b32_e32 v5, v8, v5 +; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 ; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -8648,37 +8614,37 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: flat_load_b32 v4, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v6, v3 +; GFX12-NEXT: v_not_b32_e32 v5, v5 ; GFX12-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3 -; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v7 +; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -8696,29 +8662,29 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: flat_load_dword v5, v[0:1] -; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX940-NEXT: flat_load_dword v3, v[0:1] +; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v6, v4 +; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0 +; GFX940-NEXT: v_not_b32_e32 v5, v5 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX940-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX940-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX940-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX940-NEXT: v_max_f16_e32 v4, v4, v7 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX940-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX940-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 sc1 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB35_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8728,37 +8694,37 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-LABEL: flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: flat_load_b32 v4, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v6, v3 +; GFX11-NEXT: v_not_b32_e32 v5, v5 ; GFX11-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX11-NEXT: v_max_f16_e32 v3, v3, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -8770,31 +8736,31 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX10-LABEL: flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX10-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX10-NEXT: flat_load_dword v4, v[0:1] -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX10-NEXT: v_not_b32_e32 v6, v3 +; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX10-NEXT: v_not_b32_e32 v5, v5 ; GFX10-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX10-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX10-NEXT: v_max_f16_e32 v3, v3, v7 -; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX10-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB35_1 @@ -8805,33 +8771,33 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-LABEL: flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: flat_load_dword v5, v[0:1] -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v6, v4 +; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX90A-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX90A-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX90A-NEXT: v_max_f16_e32 v4, v4, v7 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB35_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8841,31 +8807,31 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX908-LABEL: flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 +; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: flat_load_dword v4, v[0:1] -; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX908-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX908-NEXT: flat_load_dword v3, v[0:1] +; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 -; GFX908-NEXT: v_not_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX908-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX908-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX908-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX908-NEXT: v_max_f16_e32 v3, v3, v7 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX908-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB35_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8875,32 +8841,32 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX8-LABEL: flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX8-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 -; GFX8-NEXT: v_not_b32_e32 v6, v3 +; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX8-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX8-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX8-NEXT: v_max_f16_e32 v3, v3, v7 -; GFX8-NEXT: v_and_b32_e32 v8, v4, v6 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX8-NEXT: v_or_b32_e32 v3, v8, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX8-NEXT: v_and_b32_e32 v7, v3, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB35_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -12308,15 +12274,15 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 -; GFX12-NEXT: v_pk_max_num_f16 v3, v5, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 +; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -12338,14 +12304,14 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX940-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: v_pk_max_f16 v3, v5, v5 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_max_f16 v4, v3, v4 +; GFX940-NEXT: v_pk_max_f16 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12363,15 +12329,15 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX11-NEXT: v_pk_max_f16 v3, v5, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v3, v4, v4 +; GFX11-NEXT: v_pk_max_f16 v3, v3, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12391,14 +12357,14 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX10-NEXT: v_pk_max_f16 v3, v5, v3 +; GFX10-NEXT: v_pk_max_f16 v3, v4, v4 +; GFX10-NEXT: v_pk_max_f16 v3, v3, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12418,13 +12384,13 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX90A-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5 -; GFX90A-NEXT: v_pk_max_f16 v4, v3, v4 +; GFX90A-NEXT: v_pk_max_f16 v4, v3, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -12442,13 +12408,13 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX908-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_pk_max_f16 v5, v2, v2 ; GFX908-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX908-NEXT: v_pk_max_f16 v3, v3, v5 +; GFX908-NEXT: v_pk_max_f16 v3, v3, v2 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -12466,21 +12432,21 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v3, v2, v2 -; GFX8-NEXT: v_max_f16_sdwa v6, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v7, v4, v4 -; GFX8-NEXT: v_max_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v3, v7, v3 -; GFX8-NEXT: v_or_b32_e32 v3, v3, v5 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_mov_b32_e32 v6, v3 +; GFX8-NEXT: v_max_f16_sdwa v3, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 +; GFX8-NEXT: v_max_f16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX8-NEXT: v_or_b32_e32 v5, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB46_1 @@ -12544,15 +12510,15 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 -; GFX12-NEXT: v_pk_max_num_f16 v3, v5, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 +; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -12574,14 +12540,14 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX940-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: v_pk_max_f16 v3, v5, v5 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_max_f16 v4, v3, v4 +; GFX940-NEXT: v_pk_max_f16 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12599,15 +12565,15 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX11-NEXT: v_pk_max_f16 v3, v5, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v3, v4, v4 +; GFX11-NEXT: v_pk_max_f16 v3, v3, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12628,21 +12594,21 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] ; GFX10-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_pk_max_f16 v0, v2, v2 -; GFX10-NEXT: v_pk_max_f16 v5, v1, v1 -; GFX10-NEXT: v_pk_max_f16 v0, v5, v0 +; GFX10-NEXT: v_mov_b32_e32 v6, v0 +; GFX10-NEXT: v_pk_max_f16 v0, v6, v6 +; GFX10-NEXT: v_pk_max_f16 v5, v0, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB47_1 @@ -12655,13 +12621,13 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX90A-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5 -; GFX90A-NEXT: v_pk_max_f16 v4, v3, v4 +; GFX90A-NEXT: v_pk_max_f16 v4, v3, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -12679,13 +12645,13 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX908-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_pk_max_f16 v5, v2, v2 ; GFX908-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX908-NEXT: v_pk_max_f16 v3, v3, v5 +; GFX908-NEXT: v_pk_max_f16 v3, v3, v2 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -12705,21 +12671,21 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v0, v2, v2 -; GFX8-NEXT: v_max_f16_sdwa v6, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v7, v1, v1 -; GFX8-NEXT: v_max_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v0, v7, v0 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 +; GFX8-NEXT: v_max_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX8-NEXT: v_or_b32_e32 v5, v5, v0 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB47_1 @@ -12783,15 +12749,15 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 +; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 -; GFX12-NEXT: v_pk_max_num_f16 v3, v5, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 +; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -12821,19 +12787,19 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX940-NEXT: s_mov_b32 s1, -1 ; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX940-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX940-NEXT: v_pk_max_f16 v0, v1, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_pk_max_f16 v0, v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_max_f16 v0, v0, v3 +; GFX940-NEXT: v_pk_max_f16 v2, v0, v1 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB48_1 @@ -12852,20 +12818,20 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3 ; GFX11-NEXT: flat_load_b32 v0, v[4:5] ; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo +; GFX11-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX11-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, v0 -; GFX11-NEXT: v_pk_max_f16 v0, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v5, v1, v1 -; GFX11-NEXT: v_pk_max_f16 v0, v5, v0 +; GFX11-NEXT: v_mov_b32_e32 v6, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v0, v6, v6 +; GFX11-NEXT: v_pk_max_f16 v5, v0, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[0:1] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[5:6] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -12879,21 +12845,21 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo +; GFX10-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] ; GFX10-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_pk_max_f16 v0, v2, v2 -; GFX10-NEXT: v_pk_max_f16 v5, v1, v1 -; GFX10-NEXT: v_pk_max_f16 v0, v5, v0 +; GFX10-NEXT: v_mov_b32_e32 v6, v0 +; GFX10-NEXT: v_pk_max_f16 v0, v6, v6 +; GFX10-NEXT: v_pk_max_f16 v5, v0, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB48_1 @@ -12910,17 +12876,17 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: flat_load_dword v0, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX90A-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX90A-NEXT: v_pk_max_f16 v0, v1, v1 -; GFX90A-NEXT: v_pk_max_f16 v0, v0, v3 -; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_pk_max_f16 v0, v3, v3 +; GFX90A-NEXT: v_pk_max_f16 v2, v0, v1 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB48_1 @@ -12937,17 +12903,17 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX908-NEXT: flat_load_dword v0, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX908-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: v_pk_max_f16 v5, v2, v2 -; GFX908-NEXT: v_pk_max_f16 v0, v1, v1 -; GFX908-NEXT: v_pk_max_f16 v0, v0, v5 -; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX908-NEXT: v_mov_b32_e32 v6, v0 +; GFX908-NEXT: v_pk_max_f16 v0, v6, v6 +; GFX908-NEXT: v_pk_max_f16 v5, v0, v1 +; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB48_1 @@ -12962,21 +12928,21 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v0, v2, v2 -; GFX8-NEXT: v_max_f16_sdwa v6, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v7, v1, v1 -; GFX8-NEXT: v_max_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v0, v7, v0 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 +; GFX8-NEXT: v_max_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX8-NEXT: v_or_b32_e32 v5, v5, v0 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB48_1 @@ -13039,21 +13005,21 @@ define void @flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 +; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v5, v3 +; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v4 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -13067,22 +13033,22 @@ define void @flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX940-LABEL: flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v5, v[0:1] +; GFX940-NEXT: flat_load_dword v3, v[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v4, v5, v5 +; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_max_f16 v4, v4, v3 +; GFX940-NEXT: v_pk_max_f16 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB49_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13092,22 +13058,22 @@ define void @flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX11-LABEL: flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v3, v5, v3 +; GFX11-NEXT: v_pk_max_f16 v2, v2, v4 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -13119,21 +13085,21 @@ define void @flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX10-LABEL: flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: flat_load_dword v4, v[0:1] +; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX10-NEXT: v_pk_max_f16 v3, v5, v3 +; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX10-NEXT: v_pk_max_f16 v2, v2, v4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB49_1 @@ -13144,20 +13110,20 @@ define void @flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX90A-LABEL: flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v5, v[0:1] +; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v4, v5, v5 -; GFX90A-NEXT: v_pk_max_f16 v4, v4, v3 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX90A-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB49_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13167,20 +13133,20 @@ define void @flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX908-LABEL: flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v4, v[0:1] +; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX908-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX908-NEXT: v_pk_max_f16 v3, v5, v3 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX908-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB49_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13190,24 +13156,24 @@ define void @flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX8-LABEL: flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX8-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v7, v4, v4 -; GFX8-NEXT: v_max_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v5, v7, v6 -; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v6, v3, v3 +; GFX8-NEXT: v_max_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v6, v6, v5 +; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB49_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13266,21 +13232,21 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2044 +; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 +; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v5, v3 +; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v4 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -13294,22 +13260,22 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX940-LABEL: flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2044 +; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v4, v5, v5 +; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_max_f16 v4, v4, v3 +; GFX940-NEXT: v_pk_max_f16 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB50_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13319,22 +13285,22 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX11-LABEL: flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2044 +; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v3, v5, v3 +; GFX11-NEXT: v_pk_max_f16 v2, v2, v4 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -13348,21 +13314,21 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: flat_load_dword v4, v[0:1] +; GFX10-NEXT: flat_load_dword v3, v[0:1] ; GFX10-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX10-NEXT: v_pk_max_f16 v3, v5, v3 +; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX10-NEXT: v_pk_max_f16 v2, v2, v4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB50_1 @@ -13373,20 +13339,20 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX90A-LABEL: flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v4, v5, v5 -; GFX90A-NEXT: v_pk_max_f16 v4, v4, v3 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc +; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX90A-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB50_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13396,20 +13362,20 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX908-LABEL: flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 +; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX908-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX908-NEXT: v_pk_max_f16 v3, v5, v3 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc +; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX908-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB50_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13421,24 +13387,24 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX8-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v7, v4, v4 -; GFX8-NEXT: v_max_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v5, v7, v6 -; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v6, v3, v3 +; GFX8-NEXT: v_max_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v6, v6, v5 +; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB50_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13500,21 +13466,21 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:-2048 +; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 +; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 +; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v5, v3 +; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v4 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -13532,24 +13498,24 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX940-NEXT: s_movk_i32 s0, 0xf800 ; GFX940-NEXT: s_nop 0 ; GFX940-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc -; GFX940-NEXT: flat_load_dword v5, v[4:5] +; GFX940-NEXT: flat_load_dword v3, v[4:5] ; GFX940-NEXT: s_mov_b32 s1, -1 ; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v4, v5, v5 +; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_max_f16 v4, v4, v3 +; GFX940-NEXT: v_pk_max_f16 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB51_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13563,22 +13529,22 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX11-NEXT: flat_load_b32 v4, v[3:4] +; GFX11-NEXT: flat_load_b32 v3, v[3:4] +; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v3, v5, v3 +; GFX11-NEXT: v_pk_max_f16 v2, v2, v4 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -13592,21 +13558,21 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: flat_load_dword v4, v[0:1] +; GFX10-NEXT: flat_load_dword v3, v[0:1] ; GFX10-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX10-NEXT: v_pk_max_f16 v3, v5, v3 +; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX10-NEXT: v_pk_max_f16 v2, v2, v4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB51_1 @@ -13623,12 +13589,12 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: flat_load_dword v1, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX90A-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_pk_max_f16 v0, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v3, v1, v1 -; GFX90A-NEXT: v_pk_max_f16 v0, v3, v0 +; GFX90A-NEXT: v_pk_max_f16 v0, v1, v1 +; GFX90A-NEXT: v_pk_max_f16 v0, v0, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -13650,12 +13616,12 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX908-NEXT: flat_load_dword v1, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX908-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_pk_max_f16 v0, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v5, v1, v1 -; GFX908-NEXT: v_pk_max_f16 v0, v5, v0 +; GFX908-NEXT: v_pk_max_f16 v0, v1, v1 +; GFX908-NEXT: v_pk_max_f16 v0, v0, v2 ; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -13673,24 +13639,24 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX8-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v7, v4, v4 -; GFX8-NEXT: v_max_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v5, v7, v6 -; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v6, v3, v3 +; GFX8-NEXT: v_max_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v6, v6, v5 +; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB51_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13753,15 +13719,15 @@ define <2 x half> @flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 -; GFX12-NEXT: v_pk_max_num_f16 v3, v5, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 +; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v2 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS @@ -13784,14 +13750,14 @@ define <2 x half> @flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX940-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: v_pk_max_f16 v3, v5, v5 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_max_f16 v4, v3, v4 +; GFX940-NEXT: v_pk_max_f16 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13809,15 +13775,15 @@ define <2 x half> @flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX11-NEXT: v_pk_max_f16 v3, v5, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v3, v4, v4 +; GFX11-NEXT: v_pk_max_f16 v3, v3, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13838,21 +13804,21 @@ define <2 x half> @flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] ; GFX10-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_pk_max_f16 v0, v2, v2 -; GFX10-NEXT: v_pk_max_f16 v5, v1, v1 -; GFX10-NEXT: v_pk_max_f16 v0, v5, v0 +; GFX10-NEXT: v_mov_b32_e32 v6, v0 +; GFX10-NEXT: v_pk_max_f16 v0, v6, v6 +; GFX10-NEXT: v_pk_max_f16 v5, v0, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB52_1 @@ -13865,13 +13831,13 @@ define <2 x half> @flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5 -; GFX90A-NEXT: v_pk_max_f16 v4, v3, v4 +; GFX90A-NEXT: v_pk_max_f16 v4, v3, v2 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13891,13 +13857,13 @@ define <2 x half> @flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX908-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_pk_max_f16 v5, v2, v2 ; GFX908-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX908-NEXT: v_pk_max_f16 v3, v3, v5 +; GFX908-NEXT: v_pk_max_f16 v3, v3, v2 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -13917,21 +13883,21 @@ define <2 x half> @flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v0, v2, v2 -; GFX8-NEXT: v_max_f16_sdwa v6, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v7, v1, v1 -; GFX8-NEXT: v_max_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v0, v7, v0 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 +; GFX8-NEXT: v_max_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX8-NEXT: v_or_b32_e32 v5, v5, v0 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB52_1 @@ -13994,22 +13960,22 @@ define void @flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2044 +; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 +; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v5, v3 +; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v4 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -14023,22 +13989,22 @@ define void @flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX940-LABEL: flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2044 +; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v4, v5, v5 +; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_max_f16 v4, v4, v3 +; GFX940-NEXT: v_pk_max_f16 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB53_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14048,22 +14014,22 @@ define void @flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX11-LABEL: flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2044 +; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v3, v5, v3 +; GFX11-NEXT: v_pk_max_f16 v2, v2, v4 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -14077,21 +14043,21 @@ define void @flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: flat_load_dword v4, v[0:1] +; GFX10-NEXT: flat_load_dword v3, v[0:1] ; GFX10-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX10-NEXT: v_pk_max_f16 v3, v5, v3 +; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX10-NEXT: v_pk_max_f16 v2, v2, v4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB53_1 @@ -14102,22 +14068,22 @@ define void @flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX90A-LABEL: flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v4, v5, v5 -; GFX90A-NEXT: v_pk_max_f16 v4, v4, v3 +; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX90A-NEXT: v_pk_max_f16 v2, v2, v4 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB53_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14127,20 +14093,20 @@ define void @flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX908-LABEL: flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 +; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX908-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX908-NEXT: v_pk_max_f16 v3, v5, v3 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc +; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX908-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB53_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14152,24 +14118,24 @@ define void @flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX8-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v7, v4, v4 -; GFX8-NEXT: v_max_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v5, v7, v6 -; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v6, v3, v3 +; GFX8-NEXT: v_max_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v6, v6, v5 +; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB53_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14236,36 +14202,38 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX12-NEXT: v_dual_max_num_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_max_num_f32_e32 v3, v6, v3 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-NEXT: v_max_num_f32_e32 v5, v5, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v4 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -14282,35 +14250,35 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v6, v4 -; GFX940-NEXT: v_max_f32_e32 v3, v7, v3 -; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v3, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX940-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX940-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB54_1 @@ -14323,39 +14291,41 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: v_dual_max_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_max_f32_e32 v3, v6, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -14370,34 +14340,34 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_max_f32_e32 v3, v6, v3 -; GFX10-NEXT: v_max_f32_e32 v5, v7, v5 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX10-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB54_1 @@ -14411,33 +14381,33 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v6, v4 -; GFX90A-NEXT: v_max_f32_e32 v3, v7, v3 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX90A-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX90A-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB54_1 @@ -14451,33 +14421,33 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_max_f32_e32 v5, v6, v5 -; GFX908-NEXT: v_max_f32_e32 v3, v7, v3 -; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX908-NEXT: v_mov_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX908-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB54_1 @@ -14491,34 +14461,34 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_max_f32_e32 v5, v6, v5 -; GFX8-NEXT: v_max_f32_e32 v3, v7, v3 -; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v3 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX8-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB54_1 @@ -14578,36 +14548,38 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX12-NEXT: v_dual_max_num_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_max_num_f32_e32 v3, v6, v3 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-NEXT: v_max_num_f32_e32 v5, v5, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v4 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -14624,35 +14596,35 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v6, v4 -; GFX940-NEXT: v_max_f32_e32 v3, v7, v3 -; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v3, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX940-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX940-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 +; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB55_1 @@ -14665,39 +14637,41 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: v_dual_max_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_max_f32_e32 v3, v6, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -14713,35 +14687,35 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] ; GFX10-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX10-NEXT: v_max_f32_e32 v0, v6, v0 -; GFX10-NEXT: v_max_f32_e32 v5, v7, v5 -; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v0, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB55_1 @@ -14754,33 +14728,33 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v6, v4 -; GFX90A-NEXT: v_max_f32_e32 v3, v7, v3 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX90A-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX90A-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB55_1 @@ -14794,33 +14768,33 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_max_f32_e32 v5, v6, v5 -; GFX908-NEXT: v_max_f32_e32 v3, v7, v3 -; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc +; GFX908-NEXT: v_mov_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX908-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB55_1 @@ -14836,34 +14810,34 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX8-NEXT: v_max_f32_e32 v5, v6, v5 -; GFX8-NEXT: v_max_f32_e32 v0, v7, v0 -; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB55_1 @@ -14923,36 +14897,38 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX12-NEXT: v_dual_max_num_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_max_num_f32_e32 v3, v6, v3 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-NEXT: v_max_num_f32_e32 v5, v5, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v4 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -14977,35 +14953,35 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX940-NEXT: s_mov_b32 s1, -1 ; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX940-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX940-NEXT: v_max_f32_e32 v3, v6, v3 -; GFX940-NEXT: v_max_f32_e32 v0, v7, v0 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v0, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v0, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_mov_b32_e32 v7, v0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 +; GFX940-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX940-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX940-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v3, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v3 +; GFX940-NEXT: v_add3_u32 v6, v6, v0, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v3, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v0, v0 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v0, v3, v0, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v0, v6, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v6, v3, v0, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB56_1 @@ -15024,38 +15000,40 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3 ; GFX11-NEXT: flat_load_b32 v0, v[4:5] ; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_lshlrev_b32 v0, 16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: v_dual_max_f32 v0, v6, v0 :: v_dual_and_b32 v7, 0xffff0000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_max_f32_e32 v5, v7, v5 -; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_mov_b32_e32 v6, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v0 ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v0, v0 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v0, v6, v8, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v0, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v0, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v0, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[0:1] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[5:6] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -15070,35 +15048,35 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] ; GFX10-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX10-NEXT: v_max_f32_e32 v0, v6, v0 -; GFX10-NEXT: v_max_f32_e32 v5, v7, v5 -; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v0, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB56_1 @@ -15115,33 +15093,33 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: flat_load_dword v0, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX90A-NEXT: v_max_f32_e32 v3, v6, v3 -; GFX90A-NEXT: v_max_f32_e32 v0, v7, v0 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v0, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v0, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v0, v3, v0, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 +; GFX90A-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX90A-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v3, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v6, v3, v0, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB56_1 @@ -15158,33 +15136,33 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX908-NEXT: flat_load_dword v0, v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX908-NEXT: v_max_f32_e32 v5, v6, v5 -; GFX908-NEXT: v_max_f32_e32 v0, v7, v0 -; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v0, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v0, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9 -; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX908-NEXT: v_mov_b32_e32 v6, v0 +; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX908-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v0, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX908-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v0, s9 +; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB56_1 @@ -15199,34 +15177,34 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX8-NEXT: v_max_f32_e32 v5, v6, v5 -; GFX8-NEXT: v_max_f32_e32 v0, v7, v0 -; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB56_1 @@ -15285,38 +15263,38 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_max_num_f32_e32 v3, v5, v3 -; GFX12-NEXT: v_max_num_f32_e32 v5, v7, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4 +; GFX12-NEXT: v_max_num_f32_e32 v6, v6, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -15330,38 +15308,38 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX940-LABEL: flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v5, v[0:1] +; GFX940-NEXT: flat_load_dword v3, v[0:1] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX940-NEXT: v_max_f32_e32 v3, v4, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v7, v6 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB57_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15371,41 +15349,41 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX11-LABEL: flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX11-NEXT: v_max_f32_e32 v5, v7, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -15418,35 +15396,35 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX10-LABEL: flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: flat_load_dword v4, v[0:1] +; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX10-NEXT: v_max_f32_e32 v5, v7, v6 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB57_1 @@ -15457,36 +15435,36 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX90A-LABEL: flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v5, v[0:1] +; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_max_f32_e32 v3, v4, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v7, v6 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB57_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15496,36 +15474,36 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX908-LABEL: flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v4, v[0:1] +; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX908-NEXT: v_max_f32_e32 v5, v7, v6 -; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX908-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB57_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15535,37 +15513,37 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX8-LABEL: flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_max_f32_e32 v5, v7, v6 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX8-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB57_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15620,38 +15598,38 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2044 +; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_max_num_f32_e32 v3, v5, v3 -; GFX12-NEXT: v_max_num_f32_e32 v5, v7, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4 +; GFX12-NEXT: v_max_num_f32_e32 v6, v6, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -15665,38 +15643,38 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX940-LABEL: flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2044 +; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX940-NEXT: v_max_f32_e32 v3, v4, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v7, v6 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB58_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15706,41 +15684,41 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX11-LABEL: flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2044 +; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX11-NEXT: v_max_f32_e32 v5, v7, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_max_f32_e32 v6, v6, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -15755,35 +15733,35 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: flat_load_dword v4, v[0:1] +; GFX10-NEXT: flat_load_dword v3, v[0:1] ; GFX10-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX10-NEXT: v_max_f32_e32 v5, v7, v6 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB58_1 @@ -15794,36 +15772,36 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX90A-LABEL: flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_max_f32_e32 v3, v4, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v7, v6 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB58_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15833,36 +15811,36 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX908-LABEL: flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 +; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX908-NEXT: v_max_f32_e32 v5, v7, v6 -; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX908-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB58_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15874,37 +15852,37 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_max_f32_e32 v5, v7, v6 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX8-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB58_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15962,38 +15940,38 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:-2048 +; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_max_num_f32_e32 v3, v5, v3 -; GFX12-NEXT: v_max_num_f32_e32 v5, v7, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4 +; GFX12-NEXT: v_max_num_f32_e32 v6, v6, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -16011,40 +15989,40 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX940-NEXT: s_movk_i32 s0, 0xf800 ; GFX940-NEXT: s_nop 0 ; GFX940-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc -; GFX940-NEXT: flat_load_dword v5, v[4:5] +; GFX940-NEXT: flat_load_dword v3, v[4:5] ; GFX940-NEXT: s_mov_b32 s1, -1 ; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX940-NEXT: v_max_f32_e32 v3, v4, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v7, v6 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB59_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16058,41 +16036,41 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX11-NEXT: flat_load_b32 v4, v[3:4] +; GFX11-NEXT: flat_load_b32 v3, v[3:4] +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX11-NEXT: v_max_f32_e32 v5, v7, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -16107,35 +16085,35 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: flat_load_dword v4, v[0:1] +; GFX10-NEXT: flat_load_dword v3, v[0:1] ; GFX10-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX10-NEXT: v_max_f32_e32 v5, v7, v6 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB59_1 @@ -16152,28 +16130,28 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: flat_load_dword v1, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX90A-NEXT: v_max_f32_e32 v0, v3, v0 -; GFX90A-NEXT: v_max_f32_e32 v3, v7, v6 -; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GFX90A-NEXT: v_max_f32_e32 v0, v0, v3 +; GFX90A-NEXT: v_max_f32_e32 v6, v6, v2 +; GFX90A-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v0, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v0, v3, v0, s9 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v0, v6, v0, s9 ; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -16195,28 +16173,28 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX908-NEXT: flat_load_dword v1, v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v1 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX908-NEXT: v_max_f32_e32 v0, v5, v0 -; GFX908-NEXT: v_max_f32_e32 v5, v7, v6 -; GFX908-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v0, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GFX908-NEXT: v_max_f32_e32 v0, v0, v5 +; GFX908-NEXT: v_max_f32_e32 v6, v6, v2 +; GFX908-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_add3_u32 v7, v7, v0, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9 +; GFX908-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v0, v6, v0, s9 ; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -16234,37 +16212,37 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_max_f32_e32 v5, v7, v6 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX8-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB59_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16323,37 +16301,39 @@ define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX12-NEXT: v_dual_max_num_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_max_num_f32_e32 v3, v6, v3 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-NEXT: v_max_num_f32_e32 v5, v5, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v4 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -16370,35 +16350,35 @@ define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v6, v4 -; GFX940-NEXT: v_max_f32_e32 v3, v7, v3 -; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v3, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX940-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX940-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 +; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB60_1 @@ -16411,39 +16391,41 @@ define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: v_dual_max_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_max_f32_e32 v3, v6, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -16459,35 +16441,35 @@ define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] ; GFX10-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX10-NEXT: v_max_f32_e32 v0, v6, v0 -; GFX10-NEXT: v_max_f32_e32 v5, v7, v5 -; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v0, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB60_1 @@ -16500,35 +16482,35 @@ define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v6, v4 -; GFX90A-NEXT: v_max_f32_e32 v3, v7, v3 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX90A-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX90A-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB60_1 @@ -16542,33 +16524,33 @@ define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_max_f32_e32 v5, v6, v5 -; GFX908-NEXT: v_max_f32_e32 v3, v7, v3 -; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc +; GFX908-NEXT: v_mov_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX908-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB60_1 @@ -16584,34 +16566,34 @@ define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX8-NEXT: v_max_f32_e32 v5, v6, v5 -; GFX8-NEXT: v_max_f32_e32 v0, v7, v0 -; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB60_1 @@ -16670,39 +16652,39 @@ define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2044 +; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_max_num_f32_e32 v3, v5, v3 -; GFX12-NEXT: v_max_num_f32_e32 v5, v7, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4 +; GFX12-NEXT: v_max_num_f32_e32 v6, v6, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -16716,38 +16698,38 @@ define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX940-LABEL: flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2044 +; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX940-NEXT: v_max_f32_e32 v3, v4, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v7, v6 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB61_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16757,41 +16739,41 @@ define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX11-LABEL: flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2044 +; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX11-NEXT: v_max_f32_e32 v5, v7, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -16806,35 +16788,35 @@ define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: flat_load_dword v4, v[0:1] +; GFX10-NEXT: flat_load_dword v3, v[0:1] ; GFX10-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX10-NEXT: v_max_f32_e32 v5, v7, v6 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB61_1 @@ -16845,38 +16827,38 @@ define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX90A-LABEL: flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_max_f32_e32 v3, v4, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v7, v6 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB61_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16886,36 +16868,36 @@ define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX908-LABEL: flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 +; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX908-NEXT: v_max_f32_e32 v5, v7, v6 -; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX908-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB61_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16927,37 +16909,37 @@ define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_max_f32_e32 v5, v7, v6 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX8-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB61_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll index daa3df680e5ca..d96d3db9f005d 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll @@ -31,13 +31,13 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v3, v4 +; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -76,13 +76,13 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4 +; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -100,13 +100,13 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v5 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -173,13 +173,13 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grai ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v3, v4 +; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -220,13 +220,13 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grai ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4 +; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -244,13 +244,13 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grai ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v5 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -329,18 +329,18 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX940-NEXT: s_mov_b32 s1, -1 ; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX940-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 -; GFX940-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX940-NEXT: v_min_f32_e32 v0, v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_max_f32_e32 v0, v3, v3 +; GFX940-NEXT: v_min_f32_e32 v2, v0, v1 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB2_1 @@ -381,17 +381,17 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: flat_load_dword v0, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX90A-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 -; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX90A-NEXT: v_min_f32_e32 v0, v0, v3 -; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_max_f32_e32 v0, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v0, v1 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB2_1 @@ -408,17 +408,17 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX908-NEXT: flat_load_dword v0, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX908-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 -; GFX908-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX908-NEXT: v_min_f32_e32 v0, v0, v5 -; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX908-NEXT: v_mov_b32_e32 v6, v0 +; GFX908-NEXT: v_max_f32_e32 v0, v6, v6 +; GFX908-NEXT: v_min_f32_e32 v5, v0, v1 +; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB2_1 @@ -482,21 +482,21 @@ define void @flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX940-LABEL: flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v5, v[0:1] +; GFX940-NEXT: flat_load_dword v3, v[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v4, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB3_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -528,20 +528,20 @@ define void @flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v5, v[0:1] +; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v4, v3 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB3_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -551,20 +551,20 @@ define void @flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX908-LABEL: flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v4, v[0:1] +; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB3_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -622,21 +622,21 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX940-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2044 +; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v4, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB4_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -670,20 +670,20 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v4, v3 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB4_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -693,20 +693,20 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX908-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 +; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc +; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB4_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -773,23 +773,23 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_gra ; GFX940-NEXT: s_movk_i32 s0, 0xf800 ; GFX940-NEXT: s_nop 0 ; GFX940-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc -; GFX940-NEXT: flat_load_dword v5, v[4:5] +; GFX940-NEXT: flat_load_dword v3, v[4:5] ; GFX940-NEXT: s_mov_b32 s1, -1 ; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v4, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB5_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -831,12 +831,12 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_gra ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: flat_load_dword v1, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_max_f32_e32 v0, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v3, v1, v1 -; GFX90A-NEXT: v_min_f32_e32 v0, v3, v0 +; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX90A-NEXT: v_min_f32_e32 v0, v0, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -858,12 +858,12 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_gra ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX908-NEXT: flat_load_dword v1, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f32_e32 v0, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v5, v1, v1 -; GFX908-NEXT: v_min_f32_e32 v0, v5, v0 +; GFX908-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX908-NEXT: v_min_f32_e32 v0, v0, v2 ; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -935,13 +935,13 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v3, v4 +; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -982,13 +982,13 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4 +; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1008,13 +1008,13 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v5 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1084,21 +1084,21 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX940-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2044 +; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v4, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB7_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1132,22 +1132,22 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v4, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB7_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1157,20 +1157,20 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX908-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 +; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc +; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB7_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1235,13 +1235,13 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v3, v4 +; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1259,14 +1259,15 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_max_f32 v3, v2, v2 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f32_e32 v5, v4, v4 -; GFX11-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX11-NEXT: v_max_f32_e32 v3, v4, v4 +; GFX11-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1286,14 +1287,14 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_max_f32_e32 v3, v2, v2 -; GFX10-NEXT: v_max_f32_e32 v5, v4, v4 -; GFX10-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX10-NEXT: v_max_f32_e32 v3, v4, v4 +; GFX10-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1313,13 +1314,13 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4 +; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -1337,13 +1338,13 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v5 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1426,13 +1427,13 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amd ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v3, v4 +; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1471,13 +1472,13 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amd ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4 +; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -1495,13 +1496,13 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amd ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v5 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1572,13 +1573,13 @@ define float @flat_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v3, v4 +; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1617,13 +1618,13 @@ define float @flat_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4 +; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -1641,13 +1642,13 @@ define float @flat_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v5 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1714,13 +1715,13 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v3, v4 +; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1761,13 +1762,13 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4 +; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -1785,13 +1786,13 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v5 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1870,18 +1871,18 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX940-NEXT: s_mov_b32 s1, -1 ; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX940-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 -; GFX940-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX940-NEXT: v_min_f32_e32 v0, v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_max_f32_e32 v0, v3, v3 +; GFX940-NEXT: v_min_f32_e32 v2, v0, v1 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB12_1 @@ -1922,17 +1923,17 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: flat_load_dword v0, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 -; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX90A-NEXT: v_min_f32_e32 v0, v0, v3 -; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_max_f32_e32 v0, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v0, v1 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB12_1 @@ -1949,17 +1950,17 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX908-NEXT: flat_load_dword v0, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 -; GFX908-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX908-NEXT: v_min_f32_e32 v0, v0, v5 -; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX908-NEXT: v_mov_b32_e32 v6, v0 +; GFX908-NEXT: v_max_f32_e32 v0, v6, v6 +; GFX908-NEXT: v_min_f32_e32 v5, v0, v1 +; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB12_1 @@ -2023,21 +2024,21 @@ define void @flat_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memor ; GFX940-LABEL: flat_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v5, v[0:1] +; GFX940-NEXT: flat_load_dword v3, v[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v4, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB13_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2069,20 +2070,20 @@ define void @flat_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memor ; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v5, v[0:1] +; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v4, v3 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2092,20 +2093,20 @@ define void @flat_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memor ; GFX908-LABEL: flat_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v4, v[0:1] +; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB13_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2163,21 +2164,21 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX940-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2044 +; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v4, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB14_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2211,20 +2212,20 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v4, v3 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2234,20 +2235,20 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX908-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 +; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc +; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB14_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2314,23 +2315,23 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fin ; GFX940-NEXT: s_movk_i32 s0, 0xf800 ; GFX940-NEXT: s_nop 0 ; GFX940-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc -; GFX940-NEXT: flat_load_dword v5, v[4:5] +; GFX940-NEXT: flat_load_dword v3, v[4:5] ; GFX940-NEXT: s_mov_b32 s1, -1 ; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v4, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB15_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2372,12 +2373,12 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fin ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: flat_load_dword v1, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_max_f32_e32 v0, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v3, v1, v1 -; GFX90A-NEXT: v_min_f32_e32 v0, v3, v0 +; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX90A-NEXT: v_min_f32_e32 v0, v0, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -2399,12 +2400,12 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fin ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX908-NEXT: flat_load_dword v1, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f32_e32 v0, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v5, v1, v1 -; GFX908-NEXT: v_min_f32_e32 v0, v5, v0 +; GFX908-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX908-NEXT: v_min_f32_e32 v0, v0, v2 ; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -2476,13 +2477,13 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v3, v4 +; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2523,13 +2524,13 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4 +; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2549,13 +2550,13 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v5 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -2625,21 +2626,21 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX940-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2044 +; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v4, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB17_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2673,22 +2674,22 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX90A-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v4, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2698,20 +2699,20 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX908-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 +; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc +; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB17_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2769,29 +2770,29 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo -; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execz .LBB18_4 ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] +; GFX12-NEXT: flat_load_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] -; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[8:9], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9] +; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[2:3], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -2800,25 +2801,24 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: ; %bb.3: ; %Flow ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX12-NEXT: .LBB18_4: ; %Flow2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX12-NEXT: s_cbranch_execz .LBB18_6 ; GFX12-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[4:5], v6, off +; GFX12-NEXT: scratch_load_b64 v[2:3], v6, off ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] ; GFX12-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX12-NEXT: .LBB18_6: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -2865,29 +2865,29 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execz .LBB18_4 ; GFX11-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] +; GFX11-NEXT: flat_load_b64 v[2:3], v[0:1] ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX11-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX11-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -2895,23 +2895,22 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11-NEXT: ; %bb.3: ; %Flow ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX11-NEXT: .LBB18_4: ; %Flow2 ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-NEXT: s_cbranch_execz .LBB18_6 ; GFX11-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo -; GFX11-NEXT: scratch_load_b64 v[4:5], v6, off +; GFX11-NEXT: scratch_load_b64 v[2:3], v6, off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] ; GFX11-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX11-NEXT: .LBB18_6: ; %atomicrmw.phi ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: @@ -3001,91 +3000,90 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX908-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 -; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX908-NEXT: s_cbranch_execz .LBB18_4 ; GFX908-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v7, v5 -; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] -; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX908-NEXT: v_mov_b32_e32 v9, v3 +; GFX908-NEXT: v_mov_b32_e32 v8, v2 +; GFX908-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX908-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB18_2 ; GFX908-NEXT: ; %bb.3: ; %Flow ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX908-NEXT: .LBB18_4: ; %Flow2 ; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_cbranch_execz .LBB18_6 ; GFX908-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc -; GFX908-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen -; GFX908-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4 -; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX908-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX908-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1] +; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] ; GFX908-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen ; GFX908-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX908-NEXT: .LBB18_6: ; %atomicrmw.phi ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: v_mov_b32_e32 v0, v2 +; GFX908-NEXT: v_mov_b32_e32 v1, v3 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 ; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX8-NEXT: s_cbranch_execz .LBB18_4 ; GFX8-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v5, v[4:5] -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v3, v[2:3] +; GFX8-NEXT: flat_load_dword v2, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v7, v5 -; GFX8-NEXT: v_mov_b32_e32 v6, v4 -; GFX8-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] -; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: v_mov_b32_e32 v9, v3 +; GFX8-NEXT: v_mov_b32_e32 v8, v2 +; GFX8-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX8-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB18_2 ; GFX8-NEXT: ; %bb.3: ; %Flow ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX8-NEXT: .LBB18_4: ; %Flow2 ; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_cbranch_execz .LBB18_6 @@ -3093,18 +3091,17 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 4, v6 -; GFX8-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen -; GFX8-NEXT: buffer_load_dword v5, v7, s[0:3], 0 offen -; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX8-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v3, v7, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX8-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1] +; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] ; GFX8-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen ; GFX8-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen ; GFX8-NEXT: .LBB18_6: ; %atomicrmw.phi ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: v_mov_b32_e32 v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, v3 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -3162,6 +3159,7 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7f8, v0 ; GFX12-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base @@ -3188,10 +3186,9 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[8:9], v[8:9] -; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[6:7], v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[8:9], v[8:9] +; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[0:1], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -3210,7 +3207,6 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_cbranch_execz .LBB19_2 ; GFX12-NEXT: .LBB19_6: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo ; GFX12-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -3266,6 +3262,7 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX11-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7f8, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo ; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base @@ -3288,10 +3285,9 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 -; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[6:7], v[8:9], v[8:9] -; GFX11-NEXT: v_min_f64 v[6:7], v[6:7], v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] +; GFX11-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3310,7 +3306,6 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX11-NEXT: s_cbranch_execz .LBB19_2 ; GFX11-NEXT: .LBB19_6: ; %atomicrmw.private ; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo ; GFX11-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -3408,6 +3403,7 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX908-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7f8, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base @@ -3430,9 +3426,8 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v9, v1 ; GFX908-NEXT: v_mov_b32_e32 v8, v0 -; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX908-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX908-NEXT: v_min_f64 v[6:7], v[0:1], v[6:7] +; GFX908-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] ; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -3448,7 +3443,6 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: s_cbranch_execz .LBB19_2 ; GFX908-NEXT: .LBB19_6: ; %atomicrmw.private ; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc ; GFX908-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen ; GFX908-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 @@ -3464,6 +3458,7 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX8-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 ; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7f8, v0 @@ -3491,9 +3486,8 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v1 ; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX8-NEXT: v_min_f64 v[6:7], v[0:1], v[6:7] +; GFX8-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -3509,7 +3503,6 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX8-NEXT: s_cbranch_execz .LBB19_2 ; GFX8-NEXT: .LBB19_6: ; %atomicrmw.private ; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 4, v6 ; GFX8-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen @@ -3578,6 +3571,7 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 ; GFX12-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base @@ -3604,10 +3598,9 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[8:9], v[8:9] -; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[6:7], v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[8:9], v[8:9] +; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[0:1], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -3626,7 +3619,6 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_cbranch_execz .LBB20_2 ; GFX12-NEXT: .LBB20_6: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo ; GFX12-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -3683,6 +3675,7 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX11-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo ; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base @@ -3705,10 +3698,9 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 -; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[6:7], v[8:9], v[8:9] -; GFX11-NEXT: v_min_f64 v[6:7], v[6:7], v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] +; GFX11-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3727,7 +3719,6 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX11-NEXT: s_cbranch_execz .LBB20_2 ; GFX11-NEXT: .LBB20_6: ; %atomicrmw.private ; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo ; GFX11-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -3825,6 +3816,7 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX908-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base @@ -3847,9 +3839,8 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v9, v1 ; GFX908-NEXT: v_mov_b32_e32 v8, v0 -; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX908-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX908-NEXT: v_min_f64 v[6:7], v[0:1], v[6:7] +; GFX908-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] ; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -3865,7 +3856,6 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX908-NEXT: s_cbranch_execz .LBB20_2 ; GFX908-NEXT: .LBB20_6: ; %atomicrmw.private ; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc ; GFX908-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen ; GFX908-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 @@ -3881,6 +3871,7 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX8-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 ; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0 @@ -3908,9 +3899,8 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v1 ; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX8-NEXT: v_min_f64 v[6:7], v[0:1], v[6:7] +; GFX8-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -3926,7 +3916,6 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX8-NEXT: s_cbranch_execz .LBB20_2 ; GFX8-NEXT: .LBB20_6: ; %atomicrmw.private ; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 4, v6 ; GFX8-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen @@ -3995,6 +3984,7 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo ; GFX12-NEXT: s_wait_alu 0xfffe @@ -4011,21 +4001,20 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB21_3: ; %atomicrmw.global -; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] +; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB21_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[8:9], v[4:5] +; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -4034,19 +4023,18 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: ; %bb.5: ; %Flow ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX12-NEXT: s_cbranch_execz .LBB21_2 ; GFX12-NEXT: .LBB21_6: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] -; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX12-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] -; GFX12-NEXT: scratch_store_b64 v4, v[0:1], off +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[6:7] +; GFX12-NEXT: scratch_store_b64 v2, v[0:1], off ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -4092,6 +4080,7 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX11-NEXT: s_mov_b32 s0, exec_lo ; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 @@ -4104,22 +4093,21 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB21_3: ; %atomicrmw.global -; GFX11-NEXT: flat_load_b64 v[6:7], v[0:1] +; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: .LBB21_4: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] +; GFX11-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -4127,19 +4115,18 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11-NEXT: ; %bb.5: ; %Flow ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-NEXT: s_cbranch_execz .LBB21_2 ; GFX11-NEXT: .LBB21_6: ; %atomicrmw.private ; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX11-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo -; GFX11-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX11-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo +; GFX11-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; GFX11-NEXT: scratch_store_b64 v4, v[0:1], off +; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[6:7] +; GFX11-NEXT: scratch_store_b64 v2, v[0:1], off ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -4225,6 +4212,7 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX908-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -4237,40 +4225,38 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; GFX908-NEXT: .LBB21_3: ; %atomicrmw.global -; GFX908-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: .LBB21_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX908-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX908-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX908-NEXT: v_mov_b32_e32 v7, v5 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v5, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_mov_b32_e32 v4, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB21_4 ; GFX908-NEXT: ; %bb.5: ; %Flow ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX908-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_cbranch_execz .LBB21_2 ; GFX908-NEXT: .LBB21_6: ; %atomicrmw.private ; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX908-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc -; GFX908-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen -; GFX908-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX908-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc +; GFX908-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; GFX908-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen -; GFX908-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[6:7] +; GFX908-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] @@ -4278,6 +4264,7 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX8-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 ; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -4292,44 +4279,42 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; GFX8-NEXT: .LBB21_3: ; %atomicrmw.global -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v7, v[4:5] -; GFX8-NEXT: flat_load_dword v6, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v5, v[2:3] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB21_4: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX8-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX8-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v5, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB21_4 ; GFX8-NEXT: ; %bb.5: ; %Flow ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_cbranch_execz .LBB21_2 ; GFX8-NEXT: .LBB21_6: ; %atomicrmw.private ; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX8-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v4 -; GFX8-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen -; GFX8-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GFX8-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v2 +; GFX8-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; GFX8-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[6:7] +; GFX8-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -4385,13 +4370,14 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, 0x7f8, v0 -; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX12-NEXT: v_add_co_u32 v6, vcc_lo, 0x7f8, v0 +; GFX12-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v7 ; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB22_3 ; GFX12-NEXT: ; %bb.1: ; %Flow2 @@ -4404,21 +4390,20 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB22_3: ; %atomicrmw.global -; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] +; GFX12-NEXT: flat_load_b64 v[2:3], v[6:7] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB22_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[8:9], v[4:5] +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[6:7], v[0:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -4426,20 +4411,19 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_cbranch_execnz .LBB22_4 ; GFX12-NEXT: ; %bb.5: ; %Flow ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX12-NEXT: s_cbranch_execz .LBB22_2 ; GFX12-NEXT: .LBB22_6: ; %atomicrmw.private -; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] -; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX12-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] -; GFX12-NEXT: scratch_store_b64 v4, v[0:1], off +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: scratch_store_b64 v2, v[0:1], off ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -4487,12 +4471,13 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX11-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x7f8, v0 -; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, 0x7f8, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo ; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX11-NEXT: s_mov_b32 s0, exec_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v7 ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB22_3 ; GFX11-NEXT: ; %bb.1: ; %Flow2 @@ -4502,42 +4487,40 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB22_3: ; %atomicrmw.global -; GFX11-NEXT: flat_load_b64 v[6:7], v[0:1] +; GFX11-NEXT: flat_load_b64 v[2:3], v[6:7] ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: .LBB22_4: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] +; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[6:7], v[0:3] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_cbranch_execnz .LBB22_4 ; GFX11-NEXT: ; %bb.5: ; %Flow ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-NEXT: s_cbranch_execz .LBB22_2 ; GFX11-NEXT: .LBB22_6: ; %atomicrmw.private -; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX11-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo -; GFX11-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX11-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo +; GFX11-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; GFX11-NEXT: scratch_store_b64 v4, v[0:1], off +; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX11-NEXT: scratch_store_b64 v2, v[0:1], off ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -4627,10 +4610,11 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX908-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0x7f8, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX908-NEXT: v_add_co_u32_e32 v6, vcc, 0x7f8, v0 +; GFX908-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB22_3 @@ -4641,40 +4625,38 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; GFX908-NEXT: .LBB22_3: ; %atomicrmw.global -; GFX908-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: .LBB22_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX908-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX908-NEXT: v_mov_b32_e32 v7, v5 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX908-NEXT: v_mov_b32_e32 v3, v1 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_mov_b32_e32 v2, v0 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB22_4 ; GFX908-NEXT: ; %bb.5: ; %Flow ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX908-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_cbranch_execz .LBB22_2 ; GFX908-NEXT: .LBB22_6: ; %atomicrmw.private -; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX908-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc -; GFX908-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen -; GFX908-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; GFX908-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc +; GFX908-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; GFX908-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen -; GFX908-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX908-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] @@ -4682,12 +4664,13 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX8-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 ; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7f8, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7f8, v0 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v7 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB22_3 @@ -4698,44 +4681,42 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; GFX8-NEXT: .LBB22_3: ; %atomicrmw.global -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v7, v[4:5] -; GFX8-NEXT: flat_load_dword v6, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v6 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v2, v[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB22_4: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX8-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB22_4 ; GFX8-NEXT: ; %bb.5: ; %Flow ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_cbranch_execz .LBB22_2 ; GFX8-NEXT: .LBB22_6: ; %atomicrmw.private -; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX8-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v4 -; GFX8-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen -; GFX8-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; GFX8-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v2 +; GFX8-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; GFX8-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX8-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -4794,13 +4775,14 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 -; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX12-NEXT: v_add_co_u32 v6, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v7 ; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB23_3 ; GFX12-NEXT: ; %bb.1: ; %Flow2 @@ -4813,21 +4795,20 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB23_3: ; %atomicrmw.global -; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] +; GFX12-NEXT: flat_load_b64 v[2:3], v[6:7] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB23_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[8:9], v[4:5] +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[6:7], v[0:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -4835,20 +4816,19 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_cbranch_execnz .LBB23_4 ; GFX12-NEXT: ; %bb.5: ; %Flow ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX12-NEXT: s_cbranch_execz .LBB23_2 ; GFX12-NEXT: .LBB23_6: ; %atomicrmw.private -; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] -; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX12-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] -; GFX12-NEXT: scratch_store_b64 v4, v[0:1], off +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: scratch_store_b64 v2, v[0:1], off ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -4897,12 +4877,13 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX11-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 -; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, 0xfffff800, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, -1, v1, vcc_lo ; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX11-NEXT: s_mov_b32 s0, exec_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v7 ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB23_3 ; GFX11-NEXT: ; %bb.1: ; %Flow2 @@ -4912,42 +4893,40 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB23_3: ; %atomicrmw.global -; GFX11-NEXT: flat_load_b64 v[6:7], v[0:1] +; GFX11-NEXT: flat_load_b64 v[2:3], v[6:7] ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: .LBB23_4: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] +; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[6:7], v[0:3] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_cbranch_execnz .LBB23_4 ; GFX11-NEXT: ; %bb.5: ; %Flow ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-NEXT: s_cbranch_execz .LBB23_2 ; GFX11-NEXT: .LBB23_6: ; %atomicrmw.private -; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX11-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo -; GFX11-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX11-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo +; GFX11-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; GFX11-NEXT: scratch_store_b64 v4, v[0:1], off +; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX11-NEXT: scratch_store_b64 v2, v[0:1], off ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -5037,10 +5016,11 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX908-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX908-NEXT: v_add_co_u32_e32 v6, vcc, 0xfffff800, v0 +; GFX908-NEXT: v_addc_co_u32_e32 v7, vcc, -1, v1, vcc ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB23_3 @@ -5051,40 +5031,38 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; GFX908-NEXT: .LBB23_3: ; %atomicrmw.global -; GFX908-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: .LBB23_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX908-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX908-NEXT: v_mov_b32_e32 v7, v5 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX908-NEXT: v_mov_b32_e32 v3, v1 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_mov_b32_e32 v2, v0 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB23_4 ; GFX908-NEXT: ; %bb.5: ; %Flow ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX908-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_cbranch_execz .LBB23_2 ; GFX908-NEXT: .LBB23_6: ; %atomicrmw.private -; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX908-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc -; GFX908-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen -; GFX908-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; GFX908-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc +; GFX908-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; GFX908-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen -; GFX908-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX908-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] @@ -5092,12 +5070,13 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX8-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 ; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, -1, v1, vcc ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v7 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB23_3 @@ -5108,44 +5087,42 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; GFX8-NEXT: .LBB23_3: ; %atomicrmw.global -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v7, v[4:5] -; GFX8-NEXT: flat_load_dword v6, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v6 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v2, v[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB23_4: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX8-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB23_4 ; GFX8-NEXT: ; %bb.5: ; %Flow ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_cbranch_execz .LBB23_2 ; GFX8-NEXT: .LBB23_6: ; %atomicrmw.private -; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX8-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v4 -; GFX8-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen -; GFX8-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; GFX8-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v2 +; GFX8-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; GFX8-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX8-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -5204,29 +5181,29 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo -; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execz .LBB24_4 ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] +; GFX12-NEXT: flat_load_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] -; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[8:9], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9] +; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[2:3], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -5235,25 +5212,24 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX12-NEXT: ; %bb.3: ; %Flow ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX12-NEXT: .LBB24_4: ; %Flow2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX12-NEXT: s_cbranch_execz .LBB24_6 ; GFX12-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[4:5], v6, off +; GFX12-NEXT: scratch_load_b64 v[2:3], v6, off ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] ; GFX12-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX12-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -5300,29 +5276,29 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX11-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execz .LBB24_4 ; GFX11-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] +; GFX11-NEXT: flat_load_b64 v[2:3], v[0:1] ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX11-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX11-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -5330,78 +5306,76 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX11-NEXT: ; %bb.3: ; %Flow ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX11-NEXT: .LBB24_4: ; %Flow2 ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-NEXT: s_cbranch_execz .LBB24_6 ; GFX11-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo -; GFX11-NEXT: scratch_load_b64 v[4:5], v6, off +; GFX11-NEXT: scratch_load_b64 v[2:3], v6, off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] ; GFX11-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX11-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX10-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v1 ; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execz .LBB24_4 ; GFX10-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX10-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX10-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v7, v5 -; GFX10-NEXT: v_mov_b32_e32 v6, v4 -; GFX10-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX10-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX10-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] +; GFX10-NEXT: v_mov_b32_e32 v9, v3 +; GFX10-NEXT: v_mov_b32_e32 v8, v2 +; GFX10-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX10-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX10-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB24_2 ; GFX10-NEXT: ; %bb.3: ; %Flow ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX10-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX10-NEXT: .LBB24_4: ; %Flow2 ; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 ; GFX10-NEXT: s_cbranch_execz .LBB24_6 ; GFX10-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX10-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo -; GFX10-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen -; GFX10-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4 +; GFX10-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen +; GFX10-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX10-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1] +; GFX10-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX10-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] ; GFX10-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen ; GFX10-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX10-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, v4 -; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: v_mov_b32_e32 v0, v2 +; GFX10-NEXT: v_mov_b32_e32 v1, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: @@ -5409,140 +5383,138 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 -; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB24_4 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] -; GFX90A-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX90A-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9] -; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX90A-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB24_2 ; GFX90A-NEXT: ; %bb.3: ; %Flow ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: .LBB24_4: ; %Flow2 ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB24_6 ; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc -; GFX90A-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX90A-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX90A-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1] +; GFX90A-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX90A-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] ; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 -; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX908-NEXT: s_cbranch_execz .LBB24_4 ; GFX908-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v7, v5 -; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] -; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX908-NEXT: v_mov_b32_e32 v9, v3 +; GFX908-NEXT: v_mov_b32_e32 v8, v2 +; GFX908-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX908-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB24_2 ; GFX908-NEXT: ; %bb.3: ; %Flow ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX908-NEXT: .LBB24_4: ; %Flow2 ; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_cbranch_execz .LBB24_6 ; GFX908-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc -; GFX908-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen -; GFX908-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4 -; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX908-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX908-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1] +; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] ; GFX908-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen ; GFX908-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX908-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: v_mov_b32_e32 v0, v2 +; GFX908-NEXT: v_mov_b32_e32 v1, v3 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 ; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX8-NEXT: s_cbranch_execz .LBB24_4 ; GFX8-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v5, v[4:5] -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v3, v[2:3] +; GFX8-NEXT: flat_load_dword v2, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v7, v5 -; GFX8-NEXT: v_mov_b32_e32 v6, v4 -; GFX8-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] -; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: v_mov_b32_e32 v9, v3 +; GFX8-NEXT: v_mov_b32_e32 v8, v2 +; GFX8-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX8-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB24_2 ; GFX8-NEXT: ; %bb.3: ; %Flow ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX8-NEXT: .LBB24_4: ; %Flow2 ; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_cbranch_execz .LBB24_6 @@ -5550,18 +5522,17 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 4, v6 -; GFX8-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen -; GFX8-NEXT: buffer_load_dword v5, v7, s[0:3], 0 offen -; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX8-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v3, v7, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX8-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1] +; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] ; GFX8-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen ; GFX8-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen ; GFX8-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: v_mov_b32_e32 v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, v3 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -5570,37 +5541,37 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b64 s[4:5], 0xc0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX7-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX7-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX7-NEXT: s_cbranch_execz .LBB24_4 ; GFX7-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0 -; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v5, v[4:5] -; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; GFX7-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v3, v[2:3] +; GFX7-NEXT: flat_load_dword v2, v[0:1] ; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] -; GFX7-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX7-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9] -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX7-NEXT: v_mov_b32_e32 v9, v3 +; GFX7-NEXT: v_mov_b32_e32 v8, v2 +; GFX7-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX7-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB24_2 ; GFX7-NEXT: ; %bb.3: ; %Flow ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX7-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX7-NEXT: .LBB24_4: ; %Flow2 ; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: s_cbranch_execz .LBB24_6 @@ -5608,18 +5579,17 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX7-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc ; GFX7-NEXT: v_add_i32_e32 v7, vcc, 4, v6 -; GFX7-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen -; GFX7-NEXT: buffer_load_dword v5, v7, s[0:3], 0 offen -; GFX7-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX7-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen +; GFX7-NEXT: buffer_load_dword v3, v7, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX7-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1] +; GFX7-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX7-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] ; GFX7-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen ; GFX7-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen ; GFX7-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v0, v4 -; GFX7-NEXT: v_mov_b32_e32 v1, v5 +; GFX7-NEXT: v_mov_b32_e32 v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmin ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 @@ -5634,29 +5604,29 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo -; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execz .LBB25_4 ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] +; GFX12-NEXT: flat_load_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB25_2: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] -; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[8:9], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9] +; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[2:3], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -5665,25 +5635,24 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX12-NEXT: ; %bb.3: ; %Flow ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX12-NEXT: .LBB25_4: ; %Flow2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX12-NEXT: s_cbranch_execz .LBB25_6 ; GFX12-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[4:5], v6, off +; GFX12-NEXT: scratch_load_b64 v[2:3], v6, off ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] ; GFX12-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX12-NEXT: .LBB25_6: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -5730,29 +5699,29 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX11-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execz .LBB25_4 ; GFX11-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] +; GFX11-NEXT: flat_load_b64 v[2:3], v[0:1] ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: .LBB25_2: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX11-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX11-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -5760,23 +5729,22 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX11-NEXT: ; %bb.3: ; %Flow ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX11-NEXT: .LBB25_4: ; %Flow2 ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-NEXT: s_cbranch_execz .LBB25_6 ; GFX11-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo -; GFX11-NEXT: scratch_load_b64 v[4:5], v6, off +; GFX11-NEXT: scratch_load_b64 v[2:3], v6, off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] ; GFX11-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX11-NEXT: .LBB25_6: ; %atomicrmw.phi ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -5866,91 +5834,90 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX908-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 -; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX908-NEXT: s_cbranch_execz .LBB25_4 ; GFX908-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: .LBB25_2: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v7, v5 -; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] -; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX908-NEXT: v_mov_b32_e32 v9, v3 +; GFX908-NEXT: v_mov_b32_e32 v8, v2 +; GFX908-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX908-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB25_2 ; GFX908-NEXT: ; %bb.3: ; %Flow ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX908-NEXT: .LBB25_4: ; %Flow2 ; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_cbranch_execz .LBB25_6 ; GFX908-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc -; GFX908-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen -; GFX908-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4 -; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX908-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX908-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1] +; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] ; GFX908-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen ; GFX908-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX908-NEXT: .LBB25_6: ; %atomicrmw.phi ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: v_mov_b32_e32 v0, v2 +; GFX908-NEXT: v_mov_b32_e32 v1, v3 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 ; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX8-NEXT: s_cbranch_execz .LBB25_4 ; GFX8-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v5, v[4:5] -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v3, v[2:3] +; GFX8-NEXT: flat_load_dword v2, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB25_2: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v7, v5 -; GFX8-NEXT: v_mov_b32_e32 v6, v4 -; GFX8-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] -; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: v_mov_b32_e32 v9, v3 +; GFX8-NEXT: v_mov_b32_e32 v8, v2 +; GFX8-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX8-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB25_2 ; GFX8-NEXT: ; %bb.3: ; %Flow ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX8-NEXT: .LBB25_4: ; %Flow2 ; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_cbranch_execz .LBB25_6 @@ -5958,18 +5925,17 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 4, v6 -; GFX8-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen -; GFX8-NEXT: buffer_load_dword v5, v7, s[0:3], 0 offen -; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX8-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v3, v7, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX8-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1] +; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] ; GFX8-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen ; GFX8-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen ; GFX8-NEXT: .LBB25_6: ; %atomicrmw.phi ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: v_mov_b32_e32 v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, v3 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -6032,8 +5998,9 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: flat_load_b32 v5, v[0:1] @@ -6045,12 +6012,11 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v7 +; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v2 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -6083,14 +6049,14 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX940-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 -; GFX940-NEXT: v_lshrrev_b32_e32 v6, v3, v7 -; GFX940-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX940-NEXT: v_max_f16_e32 v6, v6, v6 -; GFX940-NEXT: v_min_f16_e32 v5, v6, v5 +; GFX940-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX940-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX940-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX940-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX940-NEXT: buffer_wbl2 sc1 @@ -6110,8 +6076,9 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX11-NEXT: flat_load_b32 v5, v[0:1] @@ -6123,12 +6090,11 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_min_f16_e32 v5, v5, v7 +; GFX11-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -6152,6 +6118,7 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 @@ -6163,10 +6130,9 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 -; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX10-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX10-NEXT: v_min_f16_e32 v5, v5, v7 +; GFX10-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6195,14 +6161,14 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v3, v7 -; GFX90A-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX90A-NEXT: v_max_f16_e32 v6, v6, v6 -; GFX90A-NEXT: v_min_f16_e32 v5, v6, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX90A-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc @@ -6229,14 +6195,14 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 -; GFX908-NEXT: v_lshrrev_b32_e32 v7, v3, v6 -; GFX908-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX908-NEXT: v_max_f16_e32 v7, v7, v7 -; GFX908-NEXT: v_min_f16_e32 v5, v7, v5 +; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX908-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX908-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX908-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc @@ -6263,17 +6229,17 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, v3, v6 -; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v7, v7, v7 -; GFX8-NEXT: v_min_f16_e32 v5, v7, v5 -; GFX8-NEXT: v_and_b32_e32 v8, v6, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX8-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX8-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX8-NEXT: v_or_b32_e32 v5, v8, v5 +; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 ; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -6336,10 +6302,11 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -6349,12 +6316,11 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v7 +; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v2 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -6389,14 +6355,14 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX940-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 -; GFX940-NEXT: v_lshrrev_b32_e32 v6, v3, v7 -; GFX940-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX940-NEXT: v_max_f16_e32 v6, v6, v6 -; GFX940-NEXT: v_min_f16_e32 v5, v6, v5 +; GFX940-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX940-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX940-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX940-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX940-NEXT: buffer_wbl2 sc1 @@ -6417,10 +6383,11 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: flat_load_b32 v5, v[0:1] ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -6430,12 +6397,11 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_min_f16_e32 v5, v5, v7 +; GFX11-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -6460,9 +6426,10 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v5, v[0:1] ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -6471,10 +6438,9 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 -; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX10-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX10-NEXT: v_min_f16_e32 v5, v5, v7 +; GFX10-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6504,14 +6470,14 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v3, v7 -; GFX90A-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX90A-NEXT: v_max_f16_e32 v6, v6, v6 -; GFX90A-NEXT: v_min_f16_e32 v5, v6, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX90A-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc @@ -6539,14 +6505,14 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 -; GFX908-NEXT: v_lshrrev_b32_e32 v7, v3, v6 -; GFX908-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX908-NEXT: v_max_f16_e32 v7, v7, v7 -; GFX908-NEXT: v_min_f16_e32 v5, v7, v5 +; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX908-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX908-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX908-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc @@ -6574,17 +6540,17 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, v3, v6 -; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v7, v7, v7 -; GFX8-NEXT: v_min_f16_e32 v5, v7, v5 -; GFX8-NEXT: v_and_b32_e32 v8, v6, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX8-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX8-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX8-NEXT: v_or_b32_e32 v5, v8, v5 +; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 ; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -6649,10 +6615,11 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -6662,12 +6629,11 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v7 +; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v2 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -6703,14 +6669,14 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX940-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 -; GFX940-NEXT: v_lshrrev_b32_e32 v6, v3, v7 -; GFX940-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX940-NEXT: v_max_f16_e32 v6, v6, v6 -; GFX940-NEXT: v_min_f16_e32 v5, v6, v5 +; GFX940-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX940-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX940-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX940-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX940-NEXT: buffer_wbl2 sc1 @@ -6731,10 +6697,11 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: flat_load_b32 v5, v[0:1] ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -6744,12 +6711,11 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_min_f16_e32 v5, v5, v7 +; GFX11-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -6774,9 +6740,10 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v5, v[0:1] ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -6785,10 +6752,9 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 -; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX10-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX10-NEXT: v_min_f16_e32 v5, v5, v7 +; GFX10-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6818,14 +6784,14 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v3, v7 -; GFX90A-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX90A-NEXT: v_max_f16_e32 v6, v6, v6 -; GFX90A-NEXT: v_min_f16_e32 v5, v6, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX90A-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc @@ -6853,14 +6819,14 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 -; GFX908-NEXT: v_lshrrev_b32_e32 v7, v3, v6 -; GFX908-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX908-NEXT: v_max_f16_e32 v7, v7, v7 -; GFX908-NEXT: v_min_f16_e32 v5, v7, v5 +; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX908-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX908-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX908-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc @@ -6888,17 +6854,17 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, v3, v6 -; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v7, v7, v7 -; GFX8-NEXT: v_min_f16_e32 v5, v7, v5 -; GFX8-NEXT: v_and_b32_e32 v8, v6, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX8-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX8-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX8-NEXT: v_or_b32_e32 v5, v8, v5 +; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 ; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -6962,8 +6928,9 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: flat_load_b32 v4, v[0:1] @@ -6975,10 +6942,9 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3 -; GFX12-NEXT: v_min_num_f16_e32 v3, v3, v7 +; GFX12-NEXT: v_min_num_f16_e32 v3, v3, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 @@ -7012,13 +6978,13 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX940-NEXT: v_not_b32_e32 v6, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX940-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX940-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX940-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX940-NEXT: v_min_f16_e32 v4, v4, v7 +; GFX940-NEXT: v_min_f16_e32 v4, v4, v2 ; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX940-NEXT: buffer_wbl2 sc1 @@ -7038,8 +7004,9 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX11-NEXT: flat_load_b32 v4, v[0:1] @@ -7051,10 +7018,9 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX11-NEXT: v_min_f16_e32 v3, v3, v7 +; GFX11-NEXT: v_min_f16_e32 v3, v3, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 @@ -7079,6 +7045,7 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 @@ -7090,9 +7057,8 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX10-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX10-NEXT: v_min_f16_e32 v3, v3, v7 +; GFX10-NEXT: v_min_f16_e32 v3, v3, v2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -7121,13 +7087,13 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX90A-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX90A-NEXT: v_min_f16_e32 v4, v4, v7 +; GFX90A-NEXT: v_min_f16_e32 v4, v4, v2 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc @@ -7154,13 +7120,13 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 ; GFX908-NEXT: v_not_b32_e32 v6, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX908-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX908-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX908-NEXT: v_min_f16_e32 v3, v3, v7 +; GFX908-NEXT: v_min_f16_e32 v3, v3, v2 ; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc @@ -7187,16 +7153,16 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 ; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX8-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX8-NEXT: v_min_f16_e32 v3, v3, v7 -; GFX8-NEXT: v_and_b32_e32 v8, v4, v6 +; GFX8-NEXT: v_min_f16_e32 v3, v3, v2 +; GFX8-NEXT: v_and_b32_e32 v7, v4, v6 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX8-NEXT: v_or_b32_e32 v3, v8, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v7, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -7255,36 +7221,36 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: flat_load_b32 v4, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v6, v3 +; GFX12-NEXT: v_not_b32_e32 v5, v5 ; GFX12-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3 -; GFX12-NEXT: v_min_num_f16_e32 v3, v3, v7 +; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GFX12-NEXT: v_min_num_f16_e32 v2, v2, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -7302,29 +7268,29 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: flat_load_dword v5, v[0:1] -; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX940-NEXT: flat_load_dword v3, v[0:1] +; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v6, v4 +; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0 +; GFX940-NEXT: v_not_b32_e32 v5, v5 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX940-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX940-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX940-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX940-NEXT: v_min_f16_e32 v4, v4, v7 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX940-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX940-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB30_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7334,37 +7300,37 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX11-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: flat_load_b32 v4, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v6, v3 +; GFX11-NEXT: v_not_b32_e32 v5, v5 ; GFX11-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX11-NEXT: v_min_f16_e32 v3, v3, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -7376,31 +7342,31 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX10-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX10-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX10-NEXT: flat_load_dword v4, v[0:1] -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX10-NEXT: v_not_b32_e32 v6, v3 +; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX10-NEXT: v_not_b32_e32 v5, v5 ; GFX10-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX10-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX10-NEXT: v_min_f16_e32 v3, v3, v7 -; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX10-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB30_1 @@ -7411,31 +7377,31 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: flat_load_dword v5, v[0:1] -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v6, v4 +; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX90A-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX90A-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX90A-NEXT: v_min_f16_e32 v4, v4, v7 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc +; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX90A-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB30_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7445,31 +7411,31 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX908-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 +; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: flat_load_dword v4, v[0:1] -; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX908-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX908-NEXT: flat_load_dword v3, v[0:1] +; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 -; GFX908-NEXT: v_not_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX908-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX908-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX908-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX908-NEXT: v_min_f16_e32 v3, v3, v7 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX908-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB30_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7479,32 +7445,32 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX8-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX8-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 -; GFX8-NEXT: v_not_b32_e32 v6, v3 +; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX8-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX8-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX8-NEXT: v_min_f16_e32 v3, v3, v7 -; GFX8-NEXT: v_and_b32_e32 v8, v4, v6 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX8-NEXT: v_or_b32_e32 v3, v8, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX8-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX8-NEXT: v_and_b32_e32 v7, v3, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB30_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7559,36 +7525,36 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: flat_load_b32 v4, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v6, v3 +; GFX12-NEXT: v_not_b32_e32 v5, v5 ; GFX12-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3 -; GFX12-NEXT: v_min_num_f16_e32 v3, v3, v7 +; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GFX12-NEXT: v_min_num_f16_e32 v2, v2, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -7607,29 +7573,29 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: flat_load_dword v5, v[0:1] -; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX940-NEXT: flat_load_dword v3, v[0:1] +; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v6, v4 +; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0 +; GFX940-NEXT: v_not_b32_e32 v5, v5 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX940-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX940-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX940-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX940-NEXT: v_min_f16_e32 v4, v4, v7 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX940-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX940-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB31_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7639,37 +7605,37 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX11-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX11-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: flat_load_b32 v4, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v6, v3 +; GFX11-NEXT: v_not_b32_e32 v5, v5 ; GFX11-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX11-NEXT: v_min_f16_e32 v3, v3, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -7681,31 +7647,31 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX10-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX10-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX10-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX10-NEXT: flat_load_dword v4, v[0:1] -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX10-NEXT: v_not_b32_e32 v6, v3 +; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX10-NEXT: v_not_b32_e32 v5, v5 ; GFX10-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX10-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX10-NEXT: v_min_f16_e32 v3, v3, v7 -; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX10-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB31_1 @@ -7716,31 +7682,31 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: flat_load_dword v5, v[0:1] -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v6, v4 +; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX90A-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX90A-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX90A-NEXT: v_min_f16_e32 v4, v4, v7 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc +; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX90A-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB31_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7750,31 +7716,31 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX908-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 +; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: flat_load_dword v4, v[0:1] -; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX908-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX908-NEXT: flat_load_dword v3, v[0:1] +; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 -; GFX908-NEXT: v_not_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX908-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX908-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX908-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX908-NEXT: v_min_f16_e32 v3, v3, v7 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX908-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB31_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7784,32 +7750,32 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX8-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX8-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 -; GFX8-NEXT: v_not_b32_e32 v6, v3 +; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX8-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX8-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX8-NEXT: v_min_f16_e32 v3, v3, v7 -; GFX8-NEXT: v_and_b32_e32 v8, v4, v6 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX8-NEXT: v_or_b32_e32 v3, v8, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX8-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX8-NEXT: v_and_b32_e32 v7, v3, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB31_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7865,15 +7831,15 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2046 +; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: v_max_num_f16_e32 v3, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v5, v4, v4 -; GFX12-NEXT: v_min_num_f16_e32 v3, v5, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v3, v4, v4 +; GFX12-NEXT: v_min_num_f16_e32 v3, v3, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 @@ -7898,14 +7864,14 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2046 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX940-NEXT: s_mov_b32 s2, 0xffff0000 ; GFX940-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f16_e32 v3, v5, v5 -; GFX940-NEXT: v_min_f16_e32 v3, v3, v4 +; GFX940-NEXT: v_min_f16_e32 v3, v3, v2 ; GFX940-NEXT: v_and_or_b32 v4, v5, s2, v3 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 sc0 @@ -7924,15 +7890,15 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2046 +; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: v_max_f16_e32 v3, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v5, v4, v4 -; GFX11-NEXT: v_min_f16_e32 v3, v5, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v3, v4, v4 +; GFX11-NEXT: v_min_f16_e32 v3, v3, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 @@ -7956,23 +7922,23 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_max_f16_e32 v1, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] ; GFX10-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_max_f16_e32 v0, v2, v2 -; GFX10-NEXT: v_max_f16_e32 v5, v1, v1 -; GFX10-NEXT: v_min_f16_e32 v0, v5, v0 +; GFX10-NEXT: v_mov_b32_e32 v6, v0 +; GFX10-NEXT: v_max_f16_e32 v0, v6, v6 +; GFX10-NEXT: v_min_f16_e32 v0, v0, v1 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0 +; GFX10-NEXT: v_and_or_b32 v5, 0xffff0000, v6, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB32_1 @@ -7985,14 +7951,14 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2046 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 ; GFX90A-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f16_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f16_e32 v3, v3, v4 +; GFX90A-NEXT: v_min_f16_e32 v3, v3, v2 ; GFX90A-NEXT: v_and_or_b32 v4, v5, s6, v3 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8011,14 +7977,14 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2046 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 ; GFX908-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f16_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f16_e32 v3, v3, v5 +; GFX908-NEXT: v_min_f16_e32 v3, v3, v2 ; GFX908-NEXT: v_and_or_b32 v3, v4, s6, v3 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2046 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8039,19 +8005,19 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_e32 v1, v2, v2 ; GFX8-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v0, v1, v1 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 -; GFX8-NEXT: v_min_f16_e32 v0, v0, v5 -; GFX8-NEXT: v_or_b32_e32 v0, v6, v0 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_max_f16_e32 v0, v6, v6 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX8-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX8-NEXT: v_or_b32_e32 v5, v2, v0 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB32_1 @@ -8101,24 +8067,24 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2046 +; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2046 +; GFX12-NEXT: v_max_num_f16_e32 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_max_num_f16_e32 v3, v2, v2 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_max_num_f16_e32 v5, v4, v4 +; GFX12-NEXT: v_max_num_f16_e32 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f16_e32 v3, v5, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-NEXT: v_min_num_f16_e32 v2, v2, v4 +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -8132,23 +8098,23 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX940-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2046 +; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2046 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX940-NEXT: s_mov_b32 s2, 0xffff0000 ; GFX940-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_max_f16_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f16_e32 v4, v5, v5 -; GFX940-NEXT: v_min_f16_e32 v3, v4, v3 -; GFX940-NEXT: v_and_or_b32 v4, v5, s2, v3 +; GFX940-NEXT: v_max_f16_e32 v2, v3, v3 +; GFX940-NEXT: v_min_f16_e32 v2, v2, v4 +; GFX940-NEXT: v_and_or_b32 v2, v3, s2, v2 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 sc0 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB33_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8158,25 +8124,25 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX11-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2046 +; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2046 +; GFX11-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_max_f16_e32 v3, v2, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f16_e32 v5, v4, v4 +; GFX11-NEXT: v_max_f16_e32 v2, v3, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_min_f16_e32 v3, v5, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: v_min_f16_e32 v2, v2, v4 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX11-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -8190,23 +8156,23 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: flat_load_dword v4, v[0:1] +; GFX10-NEXT: flat_load_dword v3, v[0:1] ; GFX10-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_max_f16_e32 v3, v2, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_max_f16_e32 v5, v4, v4 -; GFX10-NEXT: v_min_f16_e32 v3, v5, v3 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX10-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX10-NEXT: v_max_f16_e32 v2, v3, v3 +; GFX10-NEXT: v_min_f16_e32 v2, v2, v4 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB33_1 @@ -8217,22 +8183,22 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2046 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2046 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 ; GFX90A-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_max_f16_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f16_e32 v4, v5, v5 -; GFX90A-NEXT: v_min_f16_e32 v3, v4, v3 -; GFX90A-NEXT: v_and_or_b32 v4, v5, s6, v3 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 glc +; GFX90A-NEXT: v_max_f16_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f16_e32 v2, v2, v4 +; GFX90A-NEXT: v_and_or_b32 v2, v3, s6, v2 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB33_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8242,22 +8208,22 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX908-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2046 +; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2046 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 ; GFX908-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f16_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f16_e32 v5, v4, v4 -; GFX908-NEXT: v_min_f16_e32 v3, v5, v3 -; GFX908-NEXT: v_and_or_b32 v3, v4, s6, v3 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2046 glc +; GFX908-NEXT: v_max_f16_e32 v2, v3, v3 +; GFX908-NEXT: v_min_f16_e32 v2, v2, v4 +; GFX908-NEXT: v_and_or_b32 v2, v3, s6, v2 +; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB33_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8269,22 +8235,22 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fe, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX8-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_max_f16_e32 v3, v2, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f16_e32 v5, v4, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX8-NEXT: v_min_f16_e32 v3, v5, v3 -; GFX8-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_max_f16_e32 v2, v3, v3 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX8-NEXT: v_min_f16_e32 v2, v2, v4 +; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB33_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8334,10 +8300,11 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -8347,12 +8314,11 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v7 +; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v2 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -8388,14 +8354,14 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX940-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 -; GFX940-NEXT: v_lshrrev_b32_e32 v6, v3, v7 -; GFX940-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX940-NEXT: v_max_f16_e32 v6, v6, v6 -; GFX940-NEXT: v_min_f16_e32 v5, v6, v5 +; GFX940-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX940-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX940-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX940-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 @@ -8416,10 +8382,11 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: flat_load_b32 v5, v[0:1] ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -8429,12 +8396,11 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_min_f16_e32 v5, v5, v7 +; GFX11-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -8459,9 +8425,10 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v5, v[0:1] ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -8470,10 +8437,9 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 -; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX10-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX10-NEXT: v_min_f16_e32 v5, v5, v7 +; GFX10-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -8503,14 +8469,14 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v3, v7 -; GFX90A-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX90A-NEXT: v_max_f16_e32 v6, v6, v6 -; GFX90A-NEXT: v_min_f16_e32 v5, v6, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX90A-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX90A-NEXT: buffer_wbl2 @@ -8540,14 +8506,14 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 -; GFX908-NEXT: v_lshrrev_b32_e32 v7, v3, v6 -; GFX908-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX908-NEXT: v_max_f16_e32 v7, v7, v7 -; GFX908-NEXT: v_min_f16_e32 v5, v7, v5 +; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX908-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX908-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX908-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc @@ -8575,17 +8541,17 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, v3, v6 -; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v7, v7, v7 -; GFX8-NEXT: v_min_f16_e32 v5, v7, v5 -; GFX8-NEXT: v_and_b32_e32 v8, v6, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX8-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX8-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX8-NEXT: v_or_b32_e32 v5, v8, v5 +; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 ; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -8648,37 +8614,37 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: flat_load_b32 v4, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v6, v3 +; GFX12-NEXT: v_not_b32_e32 v5, v5 ; GFX12-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3 -; GFX12-NEXT: v_min_num_f16_e32 v3, v3, v7 +; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GFX12-NEXT: v_min_num_f16_e32 v2, v2, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -8696,29 +8662,29 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: flat_load_dword v5, v[0:1] -; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX940-NEXT: flat_load_dword v3, v[0:1] +; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v6, v4 +; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0 +; GFX940-NEXT: v_not_b32_e32 v5, v5 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX940-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX940-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX940-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX940-NEXT: v_min_f16_e32 v4, v4, v7 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX940-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX940-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 sc1 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB35_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8728,37 +8694,37 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-LABEL: flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: flat_load_b32 v4, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v6, v3 +; GFX11-NEXT: v_not_b32_e32 v5, v5 ; GFX11-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX11-NEXT: v_min_f16_e32 v3, v3, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -8770,31 +8736,31 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX10-LABEL: flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX10-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX10-NEXT: flat_load_dword v4, v[0:1] -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX10-NEXT: v_not_b32_e32 v6, v3 +; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX10-NEXT: v_not_b32_e32 v5, v5 ; GFX10-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX10-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX10-NEXT: v_min_f16_e32 v3, v3, v7 -; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX10-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB35_1 @@ -8805,33 +8771,33 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-LABEL: flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: flat_load_dword v5, v[0:1] -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v6, v4 +; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX90A-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX90A-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX90A-NEXT: v_min_f16_e32 v4, v4, v7 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX90A-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB35_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8841,31 +8807,31 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX908-LABEL: flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 +; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: flat_load_dword v4, v[0:1] -; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX908-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX908-NEXT: flat_load_dword v3, v[0:1] +; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 -; GFX908-NEXT: v_not_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX908-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX908-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX908-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX908-NEXT: v_min_f16_e32 v3, v3, v7 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX908-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB35_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8875,32 +8841,32 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX8-LABEL: flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX8-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 -; GFX8-NEXT: v_not_b32_e32 v6, v3 +; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX8-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX8-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX8-NEXT: v_min_f16_e32 v3, v3, v7 -; GFX8-NEXT: v_and_b32_e32 v8, v4, v6 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX8-NEXT: v_or_b32_e32 v3, v8, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX8-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX8-NEXT: v_and_b32_e32 v7, v3, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB35_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -12308,15 +12274,15 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 -; GFX12-NEXT: v_pk_min_num_f16 v3, v5, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 +; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -12338,14 +12304,14 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX940-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: v_pk_max_f16 v3, v5, v5 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_min_f16 v4, v3, v4 +; GFX940-NEXT: v_pk_min_f16 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12363,15 +12329,15 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX11-NEXT: v_pk_min_f16 v3, v5, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v3, v4, v4 +; GFX11-NEXT: v_pk_min_f16 v3, v3, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12391,14 +12357,14 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX10-NEXT: v_pk_min_f16 v3, v5, v3 +; GFX10-NEXT: v_pk_max_f16 v3, v4, v4 +; GFX10-NEXT: v_pk_min_f16 v3, v3, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12418,13 +12384,13 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX90A-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5 -; GFX90A-NEXT: v_pk_min_f16 v4, v3, v4 +; GFX90A-NEXT: v_pk_min_f16 v4, v3, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -12442,13 +12408,13 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX908-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_pk_max_f16 v5, v2, v2 ; GFX908-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX908-NEXT: v_pk_min_f16 v3, v3, v5 +; GFX908-NEXT: v_pk_min_f16 v3, v3, v2 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -12466,21 +12432,21 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v3, v2, v2 -; GFX8-NEXT: v_max_f16_sdwa v6, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v7, v4, v4 -; GFX8-NEXT: v_min_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v3, v7, v3 -; GFX8-NEXT: v_or_b32_e32 v3, v3, v5 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_mov_b32_e32 v6, v3 +; GFX8-NEXT: v_max_f16_sdwa v3, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 +; GFX8-NEXT: v_min_f16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX8-NEXT: v_or_b32_e32 v5, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB46_1 @@ -12544,15 +12510,15 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 -; GFX12-NEXT: v_pk_min_num_f16 v3, v5, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 +; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -12574,14 +12540,14 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX940-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: v_pk_max_f16 v3, v5, v5 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_min_f16 v4, v3, v4 +; GFX940-NEXT: v_pk_min_f16 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12599,15 +12565,15 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX11-NEXT: v_pk_min_f16 v3, v5, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v3, v4, v4 +; GFX11-NEXT: v_pk_min_f16 v3, v3, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12628,21 +12594,21 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] ; GFX10-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_pk_max_f16 v0, v2, v2 -; GFX10-NEXT: v_pk_max_f16 v5, v1, v1 -; GFX10-NEXT: v_pk_min_f16 v0, v5, v0 +; GFX10-NEXT: v_mov_b32_e32 v6, v0 +; GFX10-NEXT: v_pk_max_f16 v0, v6, v6 +; GFX10-NEXT: v_pk_min_f16 v5, v0, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB47_1 @@ -12655,13 +12621,13 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX90A-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5 -; GFX90A-NEXT: v_pk_min_f16 v4, v3, v4 +; GFX90A-NEXT: v_pk_min_f16 v4, v3, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -12679,13 +12645,13 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX908-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_pk_max_f16 v5, v2, v2 ; GFX908-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX908-NEXT: v_pk_min_f16 v3, v3, v5 +; GFX908-NEXT: v_pk_min_f16 v3, v3, v2 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -12705,21 +12671,21 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v0, v2, v2 -; GFX8-NEXT: v_max_f16_sdwa v6, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v7, v1, v1 -; GFX8-NEXT: v_min_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v0, v7, v0 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 +; GFX8-NEXT: v_min_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX8-NEXT: v_or_b32_e32 v5, v5, v0 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB47_1 @@ -12783,15 +12749,15 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 +; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 -; GFX12-NEXT: v_pk_min_num_f16 v3, v5, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 +; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -12821,19 +12787,19 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX940-NEXT: s_mov_b32 s1, -1 ; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX940-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX940-NEXT: v_pk_max_f16 v0, v1, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_pk_max_f16 v0, v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_min_f16 v0, v0, v3 +; GFX940-NEXT: v_pk_min_f16 v2, v0, v1 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB48_1 @@ -12852,20 +12818,20 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3 ; GFX11-NEXT: flat_load_b32 v0, v[4:5] ; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo +; GFX11-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX11-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, v0 -; GFX11-NEXT: v_pk_max_f16 v0, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v5, v1, v1 -; GFX11-NEXT: v_pk_min_f16 v0, v5, v0 +; GFX11-NEXT: v_mov_b32_e32 v6, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v0, v6, v6 +; GFX11-NEXT: v_pk_min_f16 v5, v0, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[0:1] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[5:6] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -12879,21 +12845,21 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo +; GFX10-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] ; GFX10-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_pk_max_f16 v0, v2, v2 -; GFX10-NEXT: v_pk_max_f16 v5, v1, v1 -; GFX10-NEXT: v_pk_min_f16 v0, v5, v0 +; GFX10-NEXT: v_mov_b32_e32 v6, v0 +; GFX10-NEXT: v_pk_max_f16 v0, v6, v6 +; GFX10-NEXT: v_pk_min_f16 v5, v0, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB48_1 @@ -12910,17 +12876,17 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: flat_load_dword v0, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX90A-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX90A-NEXT: v_pk_max_f16 v0, v1, v1 -; GFX90A-NEXT: v_pk_min_f16 v0, v0, v3 -; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_pk_max_f16 v0, v3, v3 +; GFX90A-NEXT: v_pk_min_f16 v2, v0, v1 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB48_1 @@ -12937,17 +12903,17 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX908-NEXT: flat_load_dword v0, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX908-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: v_pk_max_f16 v5, v2, v2 -; GFX908-NEXT: v_pk_max_f16 v0, v1, v1 -; GFX908-NEXT: v_pk_min_f16 v0, v0, v5 -; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX908-NEXT: v_mov_b32_e32 v6, v0 +; GFX908-NEXT: v_pk_max_f16 v0, v6, v6 +; GFX908-NEXT: v_pk_min_f16 v5, v0, v1 +; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB48_1 @@ -12962,21 +12928,21 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v0, v2, v2 -; GFX8-NEXT: v_max_f16_sdwa v6, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v7, v1, v1 -; GFX8-NEXT: v_min_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v0, v7, v0 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 +; GFX8-NEXT: v_min_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX8-NEXT: v_or_b32_e32 v5, v5, v0 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB48_1 @@ -13039,21 +13005,21 @@ define void @flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 +; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_min_num_f16 v3, v5, v3 +; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v4 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -13067,22 +13033,22 @@ define void @flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX940-LABEL: flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v5, v[0:1] +; GFX940-NEXT: flat_load_dword v3, v[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v4, v5, v5 +; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_min_f16 v4, v4, v3 +; GFX940-NEXT: v_pk_min_f16 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB49_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13092,22 +13058,22 @@ define void @flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX11-LABEL: flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_min_f16 v3, v5, v3 +; GFX11-NEXT: v_pk_min_f16 v2, v2, v4 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -13119,21 +13085,21 @@ define void @flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX10-LABEL: flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: flat_load_dword v4, v[0:1] +; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX10-NEXT: v_pk_min_f16 v3, v5, v3 +; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX10-NEXT: v_pk_min_f16 v2, v2, v4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB49_1 @@ -13144,20 +13110,20 @@ define void @flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX90A-LABEL: flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v5, v[0:1] +; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v4, v5, v5 -; GFX90A-NEXT: v_pk_min_f16 v4, v4, v3 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX90A-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB49_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13167,20 +13133,20 @@ define void @flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX908-LABEL: flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v4, v[0:1] +; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX908-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX908-NEXT: v_pk_min_f16 v3, v5, v3 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX908-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB49_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13190,24 +13156,24 @@ define void @flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX8-LABEL: flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX8-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v7, v4, v4 -; GFX8-NEXT: v_min_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v5, v7, v6 -; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v6, v3, v3 +; GFX8-NEXT: v_min_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v6, v6, v5 +; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB49_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13266,21 +13232,21 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2044 +; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 +; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_min_num_f16 v3, v5, v3 +; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v4 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -13294,22 +13260,22 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX940-LABEL: flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2044 +; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v4, v5, v5 +; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_min_f16 v4, v4, v3 +; GFX940-NEXT: v_pk_min_f16 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB50_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13319,22 +13285,22 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX11-LABEL: flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2044 +; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_min_f16 v3, v5, v3 +; GFX11-NEXT: v_pk_min_f16 v2, v2, v4 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -13348,21 +13314,21 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: flat_load_dword v4, v[0:1] +; GFX10-NEXT: flat_load_dword v3, v[0:1] ; GFX10-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX10-NEXT: v_pk_min_f16 v3, v5, v3 +; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX10-NEXT: v_pk_min_f16 v2, v2, v4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB50_1 @@ -13373,20 +13339,20 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX90A-LABEL: flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v4, v5, v5 -; GFX90A-NEXT: v_pk_min_f16 v4, v4, v3 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc +; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX90A-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB50_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13396,20 +13362,20 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX908-LABEL: flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 +; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX908-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX908-NEXT: v_pk_min_f16 v3, v5, v3 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc +; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX908-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB50_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13421,24 +13387,24 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX8-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v7, v4, v4 -; GFX8-NEXT: v_min_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v5, v7, v6 -; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v6, v3, v3 +; GFX8-NEXT: v_min_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v6, v6, v5 +; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB50_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13500,21 +13466,21 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:-2048 +; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 +; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 +; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_min_num_f16 v3, v5, v3 +; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v4 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -13532,24 +13498,24 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX940-NEXT: s_movk_i32 s0, 0xf800 ; GFX940-NEXT: s_nop 0 ; GFX940-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc -; GFX940-NEXT: flat_load_dword v5, v[4:5] +; GFX940-NEXT: flat_load_dword v3, v[4:5] ; GFX940-NEXT: s_mov_b32 s1, -1 ; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v4, v5, v5 +; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_min_f16 v4, v4, v3 +; GFX940-NEXT: v_pk_min_f16 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB51_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13563,22 +13529,22 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX11-NEXT: flat_load_b32 v4, v[3:4] +; GFX11-NEXT: flat_load_b32 v3, v[3:4] +; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_min_f16 v3, v5, v3 +; GFX11-NEXT: v_pk_min_f16 v2, v2, v4 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -13592,21 +13558,21 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: flat_load_dword v4, v[0:1] +; GFX10-NEXT: flat_load_dword v3, v[0:1] ; GFX10-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX10-NEXT: v_pk_min_f16 v3, v5, v3 +; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX10-NEXT: v_pk_min_f16 v2, v2, v4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB51_1 @@ -13623,12 +13589,12 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: flat_load_dword v1, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX90A-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_pk_max_f16 v0, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v3, v1, v1 -; GFX90A-NEXT: v_pk_min_f16 v0, v3, v0 +; GFX90A-NEXT: v_pk_max_f16 v0, v1, v1 +; GFX90A-NEXT: v_pk_min_f16 v0, v0, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -13650,12 +13616,12 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX908-NEXT: flat_load_dword v1, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX908-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_pk_max_f16 v0, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v5, v1, v1 -; GFX908-NEXT: v_pk_min_f16 v0, v5, v0 +; GFX908-NEXT: v_pk_max_f16 v0, v1, v1 +; GFX908-NEXT: v_pk_min_f16 v0, v0, v2 ; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -13673,24 +13639,24 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX8-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v7, v4, v4 -; GFX8-NEXT: v_min_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v5, v7, v6 -; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v6, v3, v3 +; GFX8-NEXT: v_min_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v6, v6, v5 +; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB51_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13753,15 +13719,15 @@ define <2 x half> @flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 -; GFX12-NEXT: v_pk_min_num_f16 v3, v5, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 +; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v2 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS @@ -13784,14 +13750,14 @@ define <2 x half> @flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX940-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: v_pk_max_f16 v3, v5, v5 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_min_f16 v4, v3, v4 +; GFX940-NEXT: v_pk_min_f16 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13809,15 +13775,15 @@ define <2 x half> @flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX11-NEXT: v_pk_min_f16 v3, v5, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v3, v4, v4 +; GFX11-NEXT: v_pk_min_f16 v3, v3, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13838,21 +13804,21 @@ define <2 x half> @flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] ; GFX10-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_pk_max_f16 v0, v2, v2 -; GFX10-NEXT: v_pk_max_f16 v5, v1, v1 -; GFX10-NEXT: v_pk_min_f16 v0, v5, v0 +; GFX10-NEXT: v_mov_b32_e32 v6, v0 +; GFX10-NEXT: v_pk_max_f16 v0, v6, v6 +; GFX10-NEXT: v_pk_min_f16 v5, v0, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB52_1 @@ -13865,13 +13831,13 @@ define <2 x half> @flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5 -; GFX90A-NEXT: v_pk_min_f16 v4, v3, v4 +; GFX90A-NEXT: v_pk_min_f16 v4, v3, v2 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13891,13 +13857,13 @@ define <2 x half> @flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX908-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_pk_max_f16 v5, v2, v2 ; GFX908-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX908-NEXT: v_pk_min_f16 v3, v3, v5 +; GFX908-NEXT: v_pk_min_f16 v3, v3, v2 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -13917,21 +13883,21 @@ define <2 x half> @flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v0, v2, v2 -; GFX8-NEXT: v_max_f16_sdwa v6, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v7, v1, v1 -; GFX8-NEXT: v_min_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v0, v7, v0 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 +; GFX8-NEXT: v_min_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX8-NEXT: v_or_b32_e32 v5, v5, v0 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB52_1 @@ -13994,22 +13960,22 @@ define void @flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2044 +; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 +; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_min_num_f16 v3, v5, v3 +; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v4 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -14023,22 +13989,22 @@ define void @flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX940-LABEL: flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2044 +; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v4, v5, v5 +; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_min_f16 v4, v4, v3 +; GFX940-NEXT: v_pk_min_f16 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB53_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14048,22 +14014,22 @@ define void @flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX11-LABEL: flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2044 +; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_min_f16 v3, v5, v3 +; GFX11-NEXT: v_pk_min_f16 v2, v2, v4 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -14077,21 +14043,21 @@ define void @flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: flat_load_dword v4, v[0:1] +; GFX10-NEXT: flat_load_dword v3, v[0:1] ; GFX10-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX10-NEXT: v_pk_min_f16 v3, v5, v3 +; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX10-NEXT: v_pk_min_f16 v2, v2, v4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB53_1 @@ -14102,22 +14068,22 @@ define void @flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX90A-LABEL: flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v4, v5, v5 -; GFX90A-NEXT: v_pk_min_f16 v4, v4, v3 +; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX90A-NEXT: v_pk_min_f16 v2, v2, v4 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB53_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14127,20 +14093,20 @@ define void @flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX908-LABEL: flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 +; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX908-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX908-NEXT: v_pk_min_f16 v3, v5, v3 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc +; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX908-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB53_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14152,24 +14118,24 @@ define void @flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX8-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v7, v4, v4 -; GFX8-NEXT: v_min_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v5, v7, v6 -; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v6, v3, v3 +; GFX8-NEXT: v_min_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v6, v6, v5 +; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB53_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14236,36 +14202,38 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX12-NEXT: v_dual_min_num_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_min_num_f32_e32 v3, v6, v3 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-NEXT: v_min_num_f32_e32 v5, v5, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v4 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -14282,35 +14250,35 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v6, v4 -; GFX940-NEXT: v_min_f32_e32 v3, v7, v3 -; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v3, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX940-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX940-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB54_1 @@ -14323,39 +14291,41 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: v_dual_min_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_min_f32_e32 v3, v6, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -14370,34 +14340,34 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_min_f32_e32 v3, v6, v3 -; GFX10-NEXT: v_min_f32_e32 v5, v7, v5 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX10-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB54_1 @@ -14411,33 +14381,33 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v6, v4 -; GFX90A-NEXT: v_min_f32_e32 v3, v7, v3 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX90A-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX90A-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB54_1 @@ -14451,33 +14421,33 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_min_f32_e32 v5, v6, v5 -; GFX908-NEXT: v_min_f32_e32 v3, v7, v3 -; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX908-NEXT: v_mov_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX908-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB54_1 @@ -14491,34 +14461,34 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_min_f32_e32 v5, v6, v5 -; GFX8-NEXT: v_min_f32_e32 v3, v7, v3 -; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v3 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX8-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB54_1 @@ -14578,36 +14548,38 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX12-NEXT: v_dual_min_num_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_min_num_f32_e32 v3, v6, v3 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-NEXT: v_min_num_f32_e32 v5, v5, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v4 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -14624,35 +14596,35 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v6, v4 -; GFX940-NEXT: v_min_f32_e32 v3, v7, v3 -; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v3, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX940-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX940-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 +; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB55_1 @@ -14665,39 +14637,41 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: v_dual_min_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_min_f32_e32 v3, v6, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -14713,35 +14687,35 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] ; GFX10-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX10-NEXT: v_min_f32_e32 v0, v6, v0 -; GFX10-NEXT: v_min_f32_e32 v5, v7, v5 -; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v0, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB55_1 @@ -14754,33 +14728,33 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v6, v4 -; GFX90A-NEXT: v_min_f32_e32 v3, v7, v3 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX90A-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX90A-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB55_1 @@ -14794,33 +14768,33 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_min_f32_e32 v5, v6, v5 -; GFX908-NEXT: v_min_f32_e32 v3, v7, v3 -; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc +; GFX908-NEXT: v_mov_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX908-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB55_1 @@ -14836,34 +14810,34 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX8-NEXT: v_min_f32_e32 v5, v6, v5 -; GFX8-NEXT: v_min_f32_e32 v0, v7, v0 -; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB55_1 @@ -14923,36 +14897,38 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX12-NEXT: v_dual_min_num_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_min_num_f32_e32 v3, v6, v3 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-NEXT: v_min_num_f32_e32 v5, v5, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v4 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -14977,35 +14953,35 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX940-NEXT: s_mov_b32 s1, -1 ; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX940-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX940-NEXT: v_min_f32_e32 v3, v6, v3 -; GFX940-NEXT: v_min_f32_e32 v0, v7, v0 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v0, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v0, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_mov_b32_e32 v7, v0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 +; GFX940-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX940-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX940-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v3, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v3 +; GFX940-NEXT: v_add3_u32 v6, v6, v0, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v3, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v0, v0 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v0, v3, v0, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v0, v6, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v6, v3, v0, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB56_1 @@ -15024,38 +15000,40 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3 ; GFX11-NEXT: flat_load_b32 v0, v[4:5] ; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_lshlrev_b32 v0, 16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: v_dual_min_f32 v0, v6, v0 :: v_dual_and_b32 v7, 0xffff0000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_min_f32_e32 v5, v7, v5 -; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_mov_b32_e32 v6, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v0 ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v0, v0 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v0, v6, v8, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v0, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v0, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v0, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[0:1] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[5:6] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -15070,35 +15048,35 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] ; GFX10-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX10-NEXT: v_min_f32_e32 v0, v6, v0 -; GFX10-NEXT: v_min_f32_e32 v5, v7, v5 -; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v0, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB56_1 @@ -15115,33 +15093,33 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: flat_load_dword v0, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX90A-NEXT: v_min_f32_e32 v3, v6, v3 -; GFX90A-NEXT: v_min_f32_e32 v0, v7, v0 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v0, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v0, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v0, v3, v0, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 +; GFX90A-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX90A-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v3, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v6, v3, v0, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB56_1 @@ -15158,33 +15136,33 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX908-NEXT: flat_load_dword v0, v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX908-NEXT: v_min_f32_e32 v5, v6, v5 -; GFX908-NEXT: v_min_f32_e32 v0, v7, v0 -; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v0, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v0, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9 -; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX908-NEXT: v_mov_b32_e32 v6, v0 +; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX908-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v0, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX908-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v0, s9 +; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB56_1 @@ -15199,34 +15177,34 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX8-NEXT: v_min_f32_e32 v5, v6, v5 -; GFX8-NEXT: v_min_f32_e32 v0, v7, v0 -; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB56_1 @@ -15285,38 +15263,38 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_min_num_f32_e32 v3, v5, v3 -; GFX12-NEXT: v_min_num_f32_e32 v5, v7, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4 +; GFX12-NEXT: v_min_num_f32_e32 v6, v6, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -15330,38 +15308,38 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX940-LABEL: flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v5, v[0:1] +; GFX940-NEXT: flat_load_dword v3, v[0:1] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX940-NEXT: v_min_f32_e32 v3, v4, v3 -; GFX940-NEXT: v_min_f32_e32 v4, v7, v6 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB57_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15371,41 +15349,41 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX11-LABEL: flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX11-NEXT: v_min_f32_e32 v5, v7, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -15418,35 +15396,35 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX10-LABEL: flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: flat_load_dword v4, v[0:1] +; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX10-NEXT: v_min_f32_e32 v5, v7, v6 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB57_1 @@ -15457,36 +15435,36 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX90A-LABEL: flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v5, v[0:1] +; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_min_f32_e32 v3, v4, v3 -; GFX90A-NEXT: v_min_f32_e32 v4, v7, v6 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB57_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15496,36 +15474,36 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX908-LABEL: flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v4, v[0:1] +; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX908-NEXT: v_min_f32_e32 v5, v7, v6 -; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX908-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB57_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15535,37 +15513,37 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX8-LABEL: flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_min_f32_e32 v5, v7, v6 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX8-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB57_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15620,38 +15598,38 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2044 +; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_min_num_f32_e32 v3, v5, v3 -; GFX12-NEXT: v_min_num_f32_e32 v5, v7, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4 +; GFX12-NEXT: v_min_num_f32_e32 v6, v6, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -15665,38 +15643,38 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX940-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2044 +; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX940-NEXT: v_min_f32_e32 v3, v4, v3 -; GFX940-NEXT: v_min_f32_e32 v4, v7, v6 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB58_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15706,41 +15684,41 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX11-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2044 +; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX11-NEXT: v_min_f32_e32 v5, v7, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_min_f32_e32 v6, v6, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -15755,35 +15733,35 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: flat_load_dword v4, v[0:1] +; GFX10-NEXT: flat_load_dword v3, v[0:1] ; GFX10-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX10-NEXT: v_min_f32_e32 v5, v7, v6 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB58_1 @@ -15794,36 +15772,36 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX90A-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_min_f32_e32 v3, v4, v3 -; GFX90A-NEXT: v_min_f32_e32 v4, v7, v6 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB58_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15833,36 +15811,36 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX908-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 +; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX908-NEXT: v_min_f32_e32 v5, v7, v6 -; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX908-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB58_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15874,37 +15852,37 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_min_f32_e32 v5, v7, v6 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX8-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB58_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15962,38 +15940,38 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:-2048 +; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_min_num_f32_e32 v3, v5, v3 -; GFX12-NEXT: v_min_num_f32_e32 v5, v7, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4 +; GFX12-NEXT: v_min_num_f32_e32 v6, v6, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -16011,40 +15989,40 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX940-NEXT: s_movk_i32 s0, 0xf800 ; GFX940-NEXT: s_nop 0 ; GFX940-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc -; GFX940-NEXT: flat_load_dword v5, v[4:5] +; GFX940-NEXT: flat_load_dword v3, v[4:5] ; GFX940-NEXT: s_mov_b32 s1, -1 ; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX940-NEXT: v_min_f32_e32 v3, v4, v3 -; GFX940-NEXT: v_min_f32_e32 v4, v7, v6 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB59_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16058,41 +16036,41 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX11-NEXT: flat_load_b32 v4, v[3:4] +; GFX11-NEXT: flat_load_b32 v3, v[3:4] +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX11-NEXT: v_min_f32_e32 v5, v7, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -16107,35 +16085,35 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: flat_load_dword v4, v[0:1] +; GFX10-NEXT: flat_load_dword v3, v[0:1] ; GFX10-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX10-NEXT: v_min_f32_e32 v5, v7, v6 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB59_1 @@ -16152,28 +16130,28 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: flat_load_dword v1, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX90A-NEXT: v_min_f32_e32 v0, v3, v0 -; GFX90A-NEXT: v_min_f32_e32 v3, v7, v6 -; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GFX90A-NEXT: v_min_f32_e32 v0, v0, v3 +; GFX90A-NEXT: v_min_f32_e32 v6, v6, v2 +; GFX90A-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v0, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v0, v3, v0, s9 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v0, v6, v0, s9 ; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -16195,28 +16173,28 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX908-NEXT: flat_load_dword v1, v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v1 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX908-NEXT: v_min_f32_e32 v0, v5, v0 -; GFX908-NEXT: v_min_f32_e32 v5, v7, v6 -; GFX908-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v0, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GFX908-NEXT: v_min_f32_e32 v0, v0, v5 +; GFX908-NEXT: v_min_f32_e32 v6, v6, v2 +; GFX908-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_add3_u32 v7, v7, v0, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9 +; GFX908-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v0, v6, v0, s9 ; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -16234,37 +16212,37 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_min_f32_e32 v5, v7, v6 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX8-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB59_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16323,37 +16301,39 @@ define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX12-NEXT: v_dual_min_num_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_min_num_f32_e32 v3, v6, v3 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-NEXT: v_min_num_f32_e32 v5, v5, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v4 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -16370,35 +16350,35 @@ define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v6, v4 -; GFX940-NEXT: v_min_f32_e32 v3, v7, v3 -; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v3, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX940-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX940-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 +; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB60_1 @@ -16411,39 +16391,41 @@ define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: v_dual_min_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_min_f32_e32 v3, v6, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -16459,35 +16441,35 @@ define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] ; GFX10-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX10-NEXT: v_min_f32_e32 v0, v6, v0 -; GFX10-NEXT: v_min_f32_e32 v5, v7, v5 -; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v0, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB60_1 @@ -16500,35 +16482,35 @@ define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v6, v4 -; GFX90A-NEXT: v_min_f32_e32 v3, v7, v3 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX90A-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX90A-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB60_1 @@ -16542,33 +16524,33 @@ define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_min_f32_e32 v5, v6, v5 -; GFX908-NEXT: v_min_f32_e32 v3, v7, v3 -; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc +; GFX908-NEXT: v_mov_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX908-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB60_1 @@ -16584,34 +16566,34 @@ define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX8-NEXT: v_min_f32_e32 v5, v6, v5 -; GFX8-NEXT: v_min_f32_e32 v0, v7, v0 -; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB60_1 @@ -16670,39 +16652,39 @@ define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2044 +; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_min_num_f32_e32 v3, v5, v3 -; GFX12-NEXT: v_min_num_f32_e32 v5, v7, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4 +; GFX12-NEXT: v_min_num_f32_e32 v6, v6, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -16716,38 +16698,38 @@ define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX940-LABEL: flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2044 +; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX940-NEXT: v_min_f32_e32 v3, v4, v3 -; GFX940-NEXT: v_min_f32_e32 v4, v7, v6 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB61_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16757,41 +16739,41 @@ define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX11-LABEL: flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2044 +; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX11-NEXT: v_min_f32_e32 v5, v7, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -16806,35 +16788,35 @@ define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: flat_load_dword v4, v[0:1] +; GFX10-NEXT: flat_load_dword v3, v[0:1] ; GFX10-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX10-NEXT: v_min_f32_e32 v5, v7, v6 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB61_1 @@ -16845,38 +16827,38 @@ define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX90A-LABEL: flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_min_f32_e32 v3, v4, v3 -; GFX90A-NEXT: v_min_f32_e32 v4, v7, v6 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB61_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16886,36 +16868,36 @@ define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX908-LABEL: flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 +; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX908-NEXT: v_min_f32_e32 v5, v7, v6 -; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX908-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB61_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16927,37 +16909,37 @@ define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_min_f32_e32 v5, v7, v6 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX8-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB61_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll index 1b56fd020d502..14f75814128f1 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll @@ -13729,36 +13729,38 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %v ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX12-NEXT: v_dual_sub_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_sub_f32_e32 v3, v6, v3 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -13775,35 +13777,35 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %v ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX940-NEXT: v_sub_f32_e32 v4, v6, v4 -; GFX940-NEXT: v_sub_f32_e32 v3, v7, v3 -; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v3, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX940-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX940-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB50_1 @@ -13816,39 +13818,41 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %v ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: v_dual_sub_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_sub_f32_e32 v3, v6, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -13863,34 +13867,34 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %v ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_sub_f32_e32 v3, v6, v3 -; GFX10-NEXT: v_sub_f32_e32 v5, v7, v5 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX10-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB50_1 @@ -13904,33 +13908,33 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %v ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_sub_f32_e32 v4, v6, v4 -; GFX90A-NEXT: v_sub_f32_e32 v3, v7, v3 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX90A-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB50_1 @@ -13944,33 +13948,33 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %v ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_sub_f32_e32 v5, v6, v5 -; GFX908-NEXT: v_sub_f32_e32 v3, v7, v3 -; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX908-NEXT: v_mov_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX908-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB50_1 @@ -13984,34 +13988,34 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %v ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_sub_f32_e32 v5, v6, v5 -; GFX8-NEXT: v_sub_f32_e32 v3, v7, v3 -; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v3 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX8-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB50_1 @@ -14071,36 +14075,38 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX12-NEXT: v_dual_sub_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_sub_f32_e32 v3, v6, v3 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -14117,35 +14123,35 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX940-NEXT: v_sub_f32_e32 v4, v6, v4 -; GFX940-NEXT: v_sub_f32_e32 v3, v7, v3 -; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v3, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX940-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX940-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 +; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB51_1 @@ -14158,39 +14164,41 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: v_dual_sub_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_sub_f32_e32 v3, v6, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -14206,35 +14214,35 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] ; GFX10-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX10-NEXT: v_sub_f32_e32 v0, v6, v0 -; GFX10-NEXT: v_sub_f32_e32 v5, v7, v5 -; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v0, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB51_1 @@ -14247,33 +14255,33 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_sub_f32_e32 v4, v6, v4 -; GFX90A-NEXT: v_sub_f32_e32 v3, v7, v3 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX90A-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB51_1 @@ -14287,33 +14295,33 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_sub_f32_e32 v5, v6, v5 -; GFX908-NEXT: v_sub_f32_e32 v3, v7, v3 -; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc +; GFX908-NEXT: v_mov_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX908-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB51_1 @@ -14329,34 +14337,34 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX8-NEXT: v_sub_f32_e32 v5, v6, v5 -; GFX8-NEXT: v_sub_f32_e32 v0, v7, v0 -; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB51_1 @@ -14416,36 +14424,38 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr, ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX12-NEXT: v_dual_sub_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_sub_f32_e32 v3, v6, v3 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -14470,35 +14480,35 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr, ; GFX940-NEXT: s_mov_b32 s1, -1 ; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX940-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX940-NEXT: v_sub_f32_e32 v3, v6, v3 -; GFX940-NEXT: v_sub_f32_e32 v0, v7, v0 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v0, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v0, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_mov_b32_e32 v7, v0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 +; GFX940-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX940-NEXT: v_sub_f32_e32 v3, v3, v2 +; GFX940-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v3, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v3 +; GFX940-NEXT: v_add3_u32 v6, v6, v0, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v3, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v0, v0 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v0, v3, v0, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v0, v6, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v6, v3, v0, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB52_1 @@ -14517,38 +14527,40 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr, ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3 ; GFX11-NEXT: flat_load_b32 v0, v[4:5] ; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_lshlrev_b32 v0, 16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: v_dual_sub_f32 v0, v6, v0 :: v_dual_and_b32 v7, 0xffff0000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_sub_f32_e32 v5, v7, v5 -; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_mov_b32_e32 v6, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v0 ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v0, v0 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v0, v6, v8, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v0, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v0, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v0, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[0:1] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[5:6] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -14563,35 +14575,35 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr, ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] ; GFX10-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX10-NEXT: v_sub_f32_e32 v0, v6, v0 -; GFX10-NEXT: v_sub_f32_e32 v5, v7, v5 -; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v0, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB52_1 @@ -14608,33 +14620,33 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr, ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: flat_load_dword v0, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX90A-NEXT: v_sub_f32_e32 v3, v6, v3 -; GFX90A-NEXT: v_sub_f32_e32 v0, v7, v0 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v0, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v0, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v0, v3, v0, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 +; GFX90A-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX90A-NEXT: v_sub_f32_e32 v3, v3, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v3, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v6, v3, v0, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB52_1 @@ -14651,33 +14663,33 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr, ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX908-NEXT: flat_load_dword v0, v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX908-NEXT: v_sub_f32_e32 v5, v6, v5 -; GFX908-NEXT: v_sub_f32_e32 v0, v7, v0 -; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v0, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v0, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9 -; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX908-NEXT: v_mov_b32_e32 v6, v0 +; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX908-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v0, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX908-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v0, s9 +; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB52_1 @@ -14692,34 +14704,34 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr, ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX8-NEXT: v_sub_f32_e32 v5, v6, v5 -; GFX8-NEXT: v_sub_f32_e32 v0, v7, v0 -; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB52_1 @@ -14778,38 +14790,38 @@ define void @flat_agent_atomic_fsub_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_sub_f32_e32 v3, v5, v3 -; GFX12-NEXT: v_sub_f32_e32 v5, v7, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX12-NEXT: v_sub_f32_e32 v6, v6, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -14823,38 +14835,38 @@ define void @flat_agent_atomic_fsub_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0 ; GFX940-LABEL: flat_agent_atomic_fsub_noret_v2bf16: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v5, v[0:1] +; GFX940-NEXT: flat_load_dword v3, v[0:1] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX940-NEXT: v_sub_f32_e32 v3, v4, v3 -; GFX940-NEXT: v_sub_f32_e32 v4, v7, v6 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX940-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB53_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14864,41 +14876,41 @@ define void @flat_agent_atomic_fsub_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0 ; GFX11-LABEL: flat_agent_atomic_fsub_noret_v2bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_sub_f32_e32 v3, v5, v3 -; GFX11-NEXT: v_sub_f32_e32 v5, v7, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -14911,35 +14923,35 @@ define void @flat_agent_atomic_fsub_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0 ; GFX10-LABEL: flat_agent_atomic_fsub_noret_v2bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: flat_load_dword v4, v[0:1] +; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_sub_f32_e32 v3, v5, v3 -; GFX10-NEXT: v_sub_f32_e32 v5, v7, v6 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB53_1 @@ -14950,36 +14962,36 @@ define void @flat_agent_atomic_fsub_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0 ; GFX90A-LABEL: flat_agent_atomic_fsub_noret_v2bf16: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v5, v[0:1] +; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_sub_f32_e32 v3, v4, v3 -; GFX90A-NEXT: v_sub_f32_e32 v4, v7, v6 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB53_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14989,36 +15001,36 @@ define void @flat_agent_atomic_fsub_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0 ; GFX908-LABEL: flat_agent_atomic_fsub_noret_v2bf16: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v4, v[0:1] +; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_sub_f32_e32 v3, v5, v3 -; GFX908-NEXT: v_sub_f32_e32 v5, v7, v6 -; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX908-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX908-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB53_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15028,37 +15040,37 @@ define void @flat_agent_atomic_fsub_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0 ; GFX8-LABEL: flat_agent_atomic_fsub_noret_v2bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_sub_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_sub_f32_e32 v5, v7, v6 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX8-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB53_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15113,38 +15125,38 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2044 +; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_sub_f32_e32 v3, v5, v3 -; GFX12-NEXT: v_sub_f32_e32 v5, v7, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX12-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -15158,38 +15170,38 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b ; GFX940-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2044 +; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX940-NEXT: v_sub_f32_e32 v3, v4, v3 -; GFX940-NEXT: v_sub_f32_e32 v4, v7, v6 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX940-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB54_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15199,41 +15211,41 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b ; GFX11-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2044 +; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_sub_f32_e32 v3, v5, v3 -; GFX11-NEXT: v_sub_f32_e32 v5, v7, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_sub_f32_e32 v6, v6, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -15248,35 +15260,35 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: flat_load_dword v4, v[0:1] +; GFX10-NEXT: flat_load_dword v3, v[0:1] ; GFX10-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_sub_f32_e32 v3, v5, v3 -; GFX10-NEXT: v_sub_f32_e32 v5, v7, v6 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB54_1 @@ -15287,36 +15299,36 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b ; GFX90A-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_sub_f32_e32 v3, v4, v3 -; GFX90A-NEXT: v_sub_f32_e32 v4, v7, v6 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB54_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15326,36 +15338,36 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b ; GFX908-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 +; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_sub_f32_e32 v3, v5, v3 -; GFX908-NEXT: v_sub_f32_e32 v5, v7, v6 -; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX908-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX908-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB54_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15367,37 +15379,37 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_sub_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_sub_f32_e32 v5, v7, v6 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX8-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB54_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15455,38 +15467,38 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:-2048 +; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_sub_f32_e32 v3, v5, v3 -; GFX12-NEXT: v_sub_f32_e32 v5, v7, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX12-NEXT: v_sub_f32_e32 v6, v6, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -15504,40 +15516,40 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b ; GFX940-NEXT: s_movk_i32 s0, 0xf800 ; GFX940-NEXT: s_nop 0 ; GFX940-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc -; GFX940-NEXT: flat_load_dword v5, v[4:5] +; GFX940-NEXT: flat_load_dword v3, v[4:5] ; GFX940-NEXT: s_mov_b32 s1, -1 ; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX940-NEXT: v_sub_f32_e32 v3, v4, v3 -; GFX940-NEXT: v_sub_f32_e32 v4, v7, v6 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX940-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB55_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15551,41 +15563,41 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b ; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX11-NEXT: flat_load_b32 v4, v[3:4] +; GFX11-NEXT: flat_load_b32 v3, v[3:4] +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_sub_f32_e32 v3, v5, v3 -; GFX11-NEXT: v_sub_f32_e32 v5, v7, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_sub_f32_e32 v6, v6, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -15600,35 +15612,35 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: flat_load_dword v4, v[0:1] +; GFX10-NEXT: flat_load_dword v3, v[0:1] ; GFX10-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_sub_f32_e32 v3, v5, v3 -; GFX10-NEXT: v_sub_f32_e32 v5, v7, v6 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB55_1 @@ -15645,28 +15657,28 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: flat_load_dword v1, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX90A-NEXT: v_sub_f32_e32 v0, v3, v0 -; GFX90A-NEXT: v_sub_f32_e32 v3, v7, v6 -; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GFX90A-NEXT: v_sub_f32_e32 v0, v0, v3 +; GFX90A-NEXT: v_sub_f32_e32 v6, v6, v2 +; GFX90A-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v0, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v0, v3, v0, s9 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v0, v6, v0, s9 ; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -15688,28 +15700,28 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX908-NEXT: flat_load_dword v1, v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v1 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX908-NEXT: v_sub_f32_e32 v0, v5, v0 -; GFX908-NEXT: v_sub_f32_e32 v5, v7, v6 -; GFX908-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v0, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GFX908-NEXT: v_sub_f32_e32 v0, v0, v5 +; GFX908-NEXT: v_sub_f32_e32 v6, v6, v2 +; GFX908-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_add3_u32 v7, v7, v0, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9 +; GFX908-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v0, v6, v0, s9 ; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -15727,37 +15739,37 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_sub_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_sub_f32_e32 v5, v7, v6 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX8-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB55_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15816,37 +15828,39 @@ define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX12-NEXT: v_dual_sub_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_sub_f32_e32 v3, v6, v3 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -15863,35 +15877,35 @@ define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX940-NEXT: v_sub_f32_e32 v4, v6, v4 -; GFX940-NEXT: v_sub_f32_e32 v3, v7, v3 -; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v3, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX940-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX940-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 +; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB56_1 @@ -15904,39 +15918,41 @@ define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: v_dual_sub_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_sub_f32_e32 v3, v6, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -15952,35 +15968,35 @@ define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] ; GFX10-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX10-NEXT: v_sub_f32_e32 v0, v6, v0 -; GFX10-NEXT: v_sub_f32_e32 v5, v7, v5 -; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v0, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB56_1 @@ -15993,35 +16009,35 @@ define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_sub_f32_e32 v4, v6, v4 -; GFX90A-NEXT: v_sub_f32_e32 v3, v7, v3 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX90A-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB56_1 @@ -16035,33 +16051,33 @@ define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_sub_f32_e32 v5, v6, v5 -; GFX908-NEXT: v_sub_f32_e32 v3, v7, v3 -; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc +; GFX908-NEXT: v_mov_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX908-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB56_1 @@ -16077,34 +16093,34 @@ define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX8-NEXT: v_sub_f32_e32 v5, v6, v5 -; GFX8-NEXT: v_sub_f32_e32 v0, v7, v0 -; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB56_1 @@ -16163,39 +16179,39 @@ define void @flat_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2044 +; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_sub_f32_e32 v3, v5, v3 -; GFX12-NEXT: v_sub_f32_e32 v5, v7, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX12-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -16209,38 +16225,38 @@ define void @flat_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x ; GFX940-LABEL: flat_system_atomic_fsub_noret_v2bf16__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2044 +; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX940-NEXT: v_sub_f32_e32 v3, v4, v3 -; GFX940-NEXT: v_sub_f32_e32 v4, v7, v6 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX940-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB57_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16250,41 +16266,41 @@ define void @flat_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x ; GFX11-LABEL: flat_system_atomic_fsub_noret_v2bf16__offset12b_pos: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2044 +; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_sub_f32_e32 v3, v5, v3 -; GFX11-NEXT: v_sub_f32_e32 v5, v7, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -16299,35 +16315,35 @@ define void @flat_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: flat_load_dword v4, v[0:1] +; GFX10-NEXT: flat_load_dword v3, v[0:1] ; GFX10-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_sub_f32_e32 v3, v5, v3 -; GFX10-NEXT: v_sub_f32_e32 v5, v7, v6 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB57_1 @@ -16338,38 +16354,38 @@ define void @flat_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x ; GFX90A-LABEL: flat_system_atomic_fsub_noret_v2bf16__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_sub_f32_e32 v3, v4, v3 -; GFX90A-NEXT: v_sub_f32_e32 v4, v7, v6 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB57_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16379,36 +16395,36 @@ define void @flat_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x ; GFX908-LABEL: flat_system_atomic_fsub_noret_v2bf16__offset12b_pos: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 +; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_sub_f32_e32 v3, v5, v3 -; GFX908-NEXT: v_sub_f32_e32 v5, v7, v6 -; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX908-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX908-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB57_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16420,37 +16436,37 @@ define void @flat_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_sub_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_sub_f32_e32 v5, v7, v6 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX8-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB57_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll index 61549fa88f424..1311560715ddd 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll @@ -2057,21 +2057,19 @@ define amdgpu_gfx void @flat_atomic_nand_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: .LBB54_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_and_b32_e32 v0, s6, v1 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: v_mov_b32_e32 v3, s5 -; GCN1-NEXT: v_not_b32_e32 v0, v0 -; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN1-NEXT: v_and_b32_e32 v2, s6, v3 +; GCN1-NEXT: v_not_b32_e32 v2, v2 +; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_mov_b32_e32 v3, v2 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_cbranch_execnz .LBB54_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2083,21 +2081,19 @@ define amdgpu_gfx void @flat_atomic_nand_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: .LBB54_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_and_b32_e32 v0, s6, v1 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 -; GCN2-NEXT: v_mov_b32_e32 v3, s5 -; GCN2-NEXT: v_not_b32_e32 v0, v0 -; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN2-NEXT: v_and_b32_e32 v2, s6, v3 +; GCN2-NEXT: v_not_b32_e32 v2, v2 +; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_mov_b32_e32 v3, v2 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_cbranch_execnz .LBB54_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2109,21 +2105,19 @@ define amdgpu_gfx void @flat_atomic_nand_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: flat_load_dword v1, v[0:1] +; GCN3-NEXT: flat_load_dword v3, v[0:1] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: .LBB54_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_and_b32_e32 v0, s6, v1 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: v_not_b32_e32 v0, v0 -; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN3-NEXT: v_and_b32_e32 v2, s6, v3 +; GCN3-NEXT: v_not_b32_e32 v2, v2 +; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: v_mov_b32_e32 v3, v2 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB54_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2141,25 +2135,23 @@ define amdgpu_gfx void @flat_atomic_nand_i32_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: s_addc_u32 s35, s5, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s34 ; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: flat_load_dword v1, v[0:1] -; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: .LBB55_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_and_b32_e32 v0, s6, v1 -; GCN1-NEXT: v_mov_b32_e32 v2, s34 -; GCN1-NEXT: v_mov_b32_e32 v3, s35 -; GCN1-NEXT: v_not_b32_e32 v0, v0 -; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN1-NEXT: v_and_b32_e32 v2, s6, v3 +; GCN1-NEXT: v_not_b32_e32 v2, v2 +; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; GCN1-NEXT: v_mov_b32_e32 v1, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: v_mov_b32_e32 v3, v2 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_cbranch_execnz .LBB55_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_nand_i32_noret_offset_scalar: @@ -2169,25 +2161,23 @@ define amdgpu_gfx void @flat_atomic_nand_i32_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_addc_u32 s35, s5, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s34 ; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: flat_load_dword v1, v[0:1] -; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: .LBB55_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_and_b32_e32 v0, s6, v1 -; GCN2-NEXT: v_mov_b32_e32 v2, s34 -; GCN2-NEXT: v_mov_b32_e32 v3, s35 -; GCN2-NEXT: v_not_b32_e32 v0, v0 -; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN2-NEXT: v_and_b32_e32 v2, s6, v3 +; GCN2-NEXT: v_not_b32_e32 v2, v2 +; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; GCN2-NEXT: v_mov_b32_e32 v1, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: v_mov_b32_e32 v3, v2 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_cbranch_execnz .LBB55_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_nand_i32_noret_offset_scalar: @@ -2195,21 +2185,19 @@ define amdgpu_gfx void @flat_atomic_nand_i32_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16 +; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: .LBB55_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_and_b32_e32 v0, s6, v1 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: v_not_b32_e32 v0, v0 -; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc +; GCN3-NEXT: v_and_b32_e32 v2, s6, v3 +; GCN3-NEXT: v_not_b32_e32 v2, v2 +; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: v_mov_b32_e32 v3, v2 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB55_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2227,19 +2215,19 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: v_mov_b32_e32 v1, s4 ; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: v_mov_b32_e32 v2, s5 ; GCN1-NEXT: .LBB56_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v1, v0 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: v_and_b32_e32 v0, s6, v1 -; GCN1-NEXT: v_mov_b32_e32 v3, s5 -; GCN1-NEXT: v_not_b32_e32 v0, v0 -; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN1-NEXT: v_mov_b32_e32 v4, v0 +; GCN1-NEXT: v_and_b32_e32 v0, s6, v4 +; GCN1-NEXT: v_not_b32_e32 v3, v0 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_cbranch_execnz .LBB56_1 @@ -2253,19 +2241,19 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: v_mov_b32_e32 v1, s4 ; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: v_mov_b32_e32 v2, s5 ; GCN2-NEXT: .LBB56_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v1, v0 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 -; GCN2-NEXT: v_and_b32_e32 v0, s6, v1 -; GCN2-NEXT: v_mov_b32_e32 v3, s5 -; GCN2-NEXT: v_not_b32_e32 v0, v0 -; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN2-NEXT: v_mov_b32_e32 v4, v0 +; GCN2-NEXT: v_and_b32_e32 v0, s6, v4 +; GCN2-NEXT: v_not_b32_e32 v3, v0 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_cbranch_execnz .LBB56_1 @@ -2279,19 +2267,19 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] +; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB56_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v1, v0 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_and_b32_e32 v0, s6, v1 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: v_not_b32_e32 v0, v0 -; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN3-NEXT: v_mov_b32_e32 v4, v0 +; GCN3-NEXT: v_and_b32_e32 v0, s6, v4 +; GCN3-NEXT: v_not_b32_e32 v3, v0 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB56_1 @@ -2308,27 +2296,25 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s34, s4, 16 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v0, s34 -; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: flat_load_dword v0, v[0:1] -; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: v_mov_b32_e32 v1, s34 +; GCN1-NEXT: v_mov_b32_e32 v2, s35 +; GCN1-NEXT: flat_load_dword v0, v[1:2] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: .LBB57_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v1, v0 -; GCN1-NEXT: v_mov_b32_e32 v2, s34 -; GCN1-NEXT: v_and_b32_e32 v0, s6, v1 -; GCN1-NEXT: v_mov_b32_e32 v3, s35 -; GCN1-NEXT: v_not_b32_e32 v0, v0 -; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN1-NEXT: v_mov_b32_e32 v4, v0 +; GCN1-NEXT: v_and_b32_e32 v0, s6, v4 +; GCN1-NEXT: v_not_b32_e32 v3, v0 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_cbranch_execnz .LBB57_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_nand_i32_ret_offset_scalar: @@ -2336,27 +2322,25 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s34, s4, 16 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v0, s34 -; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: flat_load_dword v0, v[0:1] -; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: v_mov_b32_e32 v1, s34 +; GCN2-NEXT: v_mov_b32_e32 v2, s35 +; GCN2-NEXT: flat_load_dword v0, v[1:2] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: .LBB57_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v1, v0 -; GCN2-NEXT: v_mov_b32_e32 v2, s34 -; GCN2-NEXT: v_and_b32_e32 v0, s6, v1 -; GCN2-NEXT: v_mov_b32_e32 v3, s35 -; GCN2-NEXT: v_not_b32_e32 v0, v0 -; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN2-NEXT: v_mov_b32_e32 v4, v0 +; GCN2-NEXT: v_and_b32_e32 v0, s6, v4 +; GCN2-NEXT: v_not_b32_e32 v3, v0 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_cbranch_execnz .LBB57_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_nand_i32_ret_offset_scalar: @@ -2365,19 +2349,19 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 +; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB57_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v1, v0 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_and_b32_e32 v0, s6, v1 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: v_not_b32_e32 v0, v0 -; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc +; GCN3-NEXT: v_mov_b32_e32 v4, v0 +; GCN3-NEXT: v_and_b32_e32 v0, s6, v4 +; GCN3-NEXT: v_not_b32_e32 v3, v0 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB57_1 @@ -3532,20 +3516,18 @@ define amdgpu_gfx void @flat_atomic_max_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: .LBB84_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_max_i32_e32 v0, s6, v1 -; GCN1-NEXT: v_mov_b32_e32 v3, s5 -; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN1-NEXT: v_max_i32_e32 v2, s6, v3 +; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_mov_b32_e32 v3, v2 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_cbranch_execnz .LBB84_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3557,20 +3539,18 @@ define amdgpu_gfx void @flat_atomic_max_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: .LBB84_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_max_i32_e32 v0, s6, v1 -; GCN2-NEXT: v_mov_b32_e32 v3, s5 -; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN2-NEXT: v_max_i32_e32 v2, s6, v3 +; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_mov_b32_e32 v3, v2 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_cbranch_execnz .LBB84_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3582,20 +3562,18 @@ define amdgpu_gfx void @flat_atomic_max_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: flat_load_dword v1, v[0:1] +; GCN3-NEXT: flat_load_dword v3, v[0:1] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: .LBB84_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_max_i32_e32 v0, s6, v1 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN3-NEXT: v_max_i32_e32 v2, s6, v3 +; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: v_mov_b32_e32 v3, v2 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB84_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3613,24 +3591,22 @@ define amdgpu_gfx void @flat_atomic_max_i32_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: s_addc_u32 s35, s5, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s34 ; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: flat_load_dword v1, v[0:1] -; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: .LBB85_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: v_mov_b32_e32 v2, s34 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_max_i32_e32 v0, s6, v1 -; GCN1-NEXT: v_mov_b32_e32 v3, s35 -; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN1-NEXT: v_max_i32_e32 v2, s6, v3 +; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; GCN1-NEXT: v_mov_b32_e32 v1, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: v_mov_b32_e32 v3, v2 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_cbranch_execnz .LBB85_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_max_i32_noret_offset_scalar: @@ -3640,24 +3616,22 @@ define amdgpu_gfx void @flat_atomic_max_i32_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_addc_u32 s35, s5, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s34 ; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: flat_load_dword v1, v[0:1] -; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: .LBB85_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: v_mov_b32_e32 v2, s34 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_max_i32_e32 v0, s6, v1 -; GCN2-NEXT: v_mov_b32_e32 v3, s35 -; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN2-NEXT: v_max_i32_e32 v2, s6, v3 +; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; GCN2-NEXT: v_mov_b32_e32 v1, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: v_mov_b32_e32 v3, v2 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_cbranch_execnz .LBB85_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_max_i32_noret_offset_scalar: @@ -3665,20 +3639,18 @@ define amdgpu_gfx void @flat_atomic_max_i32_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16 +; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: .LBB85_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_max_i32_e32 v0, s6, v1 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc +; GCN3-NEXT: v_max_i32_e32 v2, s6, v3 +; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: v_mov_b32_e32 v3, v2 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB85_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3696,18 +3668,18 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: v_mov_b32_e32 v1, s4 ; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: v_mov_b32_e32 v2, s5 ; GCN1-NEXT: .LBB86_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v1, v0 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: v_mov_b32_e32 v3, s5 -; GCN1-NEXT: v_max_i32_e32 v0, s6, v1 -; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN1-NEXT: v_mov_b32_e32 v4, v0 +; GCN1-NEXT: v_max_i32_e32 v3, s6, v4 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_cbranch_execnz .LBB86_1 @@ -3721,18 +3693,18 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: v_mov_b32_e32 v1, s4 ; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: v_mov_b32_e32 v2, s5 ; GCN2-NEXT: .LBB86_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v1, v0 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 -; GCN2-NEXT: v_mov_b32_e32 v3, s5 -; GCN2-NEXT: v_max_i32_e32 v0, s6, v1 -; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN2-NEXT: v_mov_b32_e32 v4, v0 +; GCN2-NEXT: v_max_i32_e32 v3, s6, v4 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_cbranch_execnz .LBB86_1 @@ -3746,18 +3718,18 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] +; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB86_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v1, v0 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: v_max_i32_e32 v0, s6, v1 -; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN3-NEXT: v_mov_b32_e32 v4, v0 +; GCN3-NEXT: v_max_i32_e32 v3, s6, v4 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB86_1 @@ -3774,26 +3746,24 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s34, s4, 16 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v0, s34 -; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: flat_load_dword v0, v[0:1] -; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: v_mov_b32_e32 v1, s34 +; GCN1-NEXT: v_mov_b32_e32 v2, s35 +; GCN1-NEXT: flat_load_dword v0, v[1:2] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: .LBB87_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v1, v0 -; GCN1-NEXT: v_mov_b32_e32 v2, s34 -; GCN1-NEXT: v_mov_b32_e32 v3, s35 -; GCN1-NEXT: v_max_i32_e32 v0, s6, v1 -; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN1-NEXT: v_mov_b32_e32 v4, v0 +; GCN1-NEXT: v_max_i32_e32 v3, s6, v4 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_cbranch_execnz .LBB87_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_max_i32_ret_offset_scalar: @@ -3801,26 +3771,24 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s34, s4, 16 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v0, s34 -; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: flat_load_dword v0, v[0:1] -; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: v_mov_b32_e32 v1, s34 +; GCN2-NEXT: v_mov_b32_e32 v2, s35 +; GCN2-NEXT: flat_load_dword v0, v[1:2] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: .LBB87_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v1, v0 -; GCN2-NEXT: v_mov_b32_e32 v2, s34 -; GCN2-NEXT: v_mov_b32_e32 v3, s35 -; GCN2-NEXT: v_max_i32_e32 v0, s6, v1 -; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN2-NEXT: v_mov_b32_e32 v4, v0 +; GCN2-NEXT: v_max_i32_e32 v3, s6, v4 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_cbranch_execnz .LBB87_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_max_i32_ret_offset_scalar: @@ -3829,18 +3797,18 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 +; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB87_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v1, v0 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: v_max_i32_e32 v0, s6, v1 -; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc +; GCN3-NEXT: v_mov_b32_e32 v4, v0 +; GCN3-NEXT: v_max_i32_e32 v3, s6, v4 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB87_1 @@ -3866,21 +3834,19 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i32 % ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_dword v1, v[0:1] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: s_mov_b64 s[0:1], 0 ; GCN1-NEXT: .LBB88_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_max_i32_e32 v0, s2, v1 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN1-NEXT: v_max_i32_e32 v2, s2, v3 +; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: v_mov_b32_e32 v1, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN1-NEXT: v_mov_b32_e32 v3, v2 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN1-NEXT: s_cbranch_execnz .LBB88_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_endpgm @@ -3898,21 +3864,19 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i32 % ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: flat_load_dword v1, v[0:1] -; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: s_mov_b64 s[0:1], 0 ; GCN2-NEXT: .LBB88_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_max_i32_e32 v0, s2, v1 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN2-NEXT: v_max_i32_e32 v2, s2, v3 +; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: v_mov_b32_e32 v1, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN2-NEXT: v_mov_b32_e32 v3, v2 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN2-NEXT: s_cbranch_execnz .LBB88_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_endpgm @@ -3928,21 +3892,19 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i32 % ; GCN3-NEXT: s_addc_u32 s1, s1, s5 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16 -; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[0:1], 0 ; GCN3-NEXT: .LBB88_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: v_mov_b32_e32 v3, s1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_max_i32_e32 v0, s2, v1 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 -; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc +; GCN3-NEXT: v_max_i32_e32 v2, s2, v3 +; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: v_mov_b32_e32 v1, v0 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN3-NEXT: v_mov_b32_e32 v3, v2 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN3-NEXT: s_cbranch_execnz .LBB88_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_endpgm @@ -3968,27 +3930,25 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_dword v0, v[0:1] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: flat_load_dword v2, v[0:1] +; GCN1-NEXT: s_mov_b64 s[0:1], 0 ; GCN1-NEXT: .LBB89_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v1, v0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: v_max_i32_e32 v0, s6, v1 -; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN1-NEXT: v_mov_b32_e32 v3, v2 +; GCN1-NEXT: v_max_i32_e32 v2, s6, v3 +; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN1-NEXT: s_cbranch_execnz .LBB89_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN1-NEXT: v_mov_b32_e32 v1, s2 -; GCN1-NEXT: v_mov_b32_e32 v2, s3 -; GCN1-NEXT: flat_store_dword v[1:2], v0 +; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: flat_store_dword v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_max_i32_ret_addr64_offset: @@ -4005,27 +3965,25 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: flat_load_dword v0, v[0:1] -; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: flat_load_dword v2, v[0:1] +; GCN2-NEXT: s_mov_b64 s[0:1], 0 ; GCN2-NEXT: .LBB89_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v1, v0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: v_max_i32_e32 v0, s6, v1 -; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN2-NEXT: v_mov_b32_e32 v3, v2 +; GCN2-NEXT: v_max_i32_e32 v2, s6, v3 +; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN2-NEXT: s_cbranch_execnz .LBB89_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN2-NEXT: v_mov_b32_e32 v1, s2 -; GCN2-NEXT: v_mov_b32_e32 v2, s3 -; GCN2-NEXT: flat_store_dword v[1:2], v0 +; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_max_i32_ret_addr64_offset: @@ -4040,27 +3998,25 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GCN3-NEXT: s_addc_u32 s1, s1, s5 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 -; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: flat_load_dword v2, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[0:1], 0 ; GCN3-NEXT: .LBB89_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v1, v0 -; GCN3-NEXT: v_mov_b32_e32 v3, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 -; GCN3-NEXT: v_max_i32_e32 v0, s6, v1 -; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc +; GCN3-NEXT: v_mov_b32_e32 v3, v2 +; GCN3-NEXT: v_max_i32_e32 v2, s6, v3 +; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN3-NEXT: s_cbranch_execnz .LBB89_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN3-NEXT: v_mov_b32_e32 v1, s2 -; GCN3-NEXT: v_mov_b32_e32 v2, s3 -; GCN3-NEXT: flat_store_dword v[1:2], v0 +; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i32 %index @@ -4082,21 +4038,19 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i32 %index) ; GCN1-NEXT: s_addc_u32 s1, s1, s5 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_dword v1, v[0:1] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: s_mov_b64 s[0:1], 0 ; GCN1-NEXT: .LBB90_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_max_i32_e32 v0, s2, v1 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN1-NEXT: v_max_i32_e32 v2, s2, v3 +; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: v_mov_b32_e32 v1, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN1-NEXT: v_mov_b32_e32 v3, v2 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN1-NEXT: s_cbranch_execnz .LBB90_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_endpgm @@ -4112,21 +4066,19 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i32 %index) ; GCN2-NEXT: s_addc_u32 s1, s1, s5 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: flat_load_dword v1, v[0:1] -; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: s_mov_b64 s[0:1], 0 ; GCN2-NEXT: .LBB90_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_max_i32_e32 v0, s2, v1 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN2-NEXT: v_max_i32_e32 v2, s2, v3 +; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: v_mov_b32_e32 v1, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN2-NEXT: v_mov_b32_e32 v3, v2 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN2-NEXT: s_cbranch_execnz .LBB90_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_endpgm @@ -4142,21 +4094,19 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i32 %index) ; GCN3-NEXT: s_addc_u32 s1, s1, s5 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_dword v1, v[0:1] -; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: flat_load_dword v3, v[0:1] +; GCN3-NEXT: s_mov_b64 s[0:1], 0 ; GCN3-NEXT: .LBB90_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: v_mov_b32_e32 v3, s1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_max_i32_e32 v0, s2, v1 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 -; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN3-NEXT: v_max_i32_e32 v2, s2, v3 +; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: v_mov_b32_e32 v1, v0 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN3-NEXT: v_mov_b32_e32 v3, v2 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN3-NEXT: s_cbranch_execnz .LBB90_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_endpgm @@ -4179,27 +4129,25 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GCN1-NEXT: s_addc_u32 s1, s1, s5 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_dword v0, v[0:1] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: flat_load_dword v2, v[0:1] +; GCN1-NEXT: s_mov_b64 s[0:1], 0 ; GCN1-NEXT: .LBB91_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v1, v0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: v_max_i32_e32 v0, s6, v1 -; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN1-NEXT: v_mov_b32_e32 v3, v2 +; GCN1-NEXT: v_max_i32_e32 v2, s6, v3 +; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN1-NEXT: s_cbranch_execnz .LBB91_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN1-NEXT: v_mov_b32_e32 v1, s2 -; GCN1-NEXT: v_mov_b32_e32 v2, s3 -; GCN1-NEXT: flat_store_dword v[1:2], v0 +; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: flat_store_dword v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_max_i32_ret_addr64: @@ -4214,27 +4162,25 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GCN2-NEXT: s_addc_u32 s1, s1, s5 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: flat_load_dword v0, v[0:1] -; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: flat_load_dword v2, v[0:1] +; GCN2-NEXT: s_mov_b64 s[0:1], 0 ; GCN2-NEXT: .LBB91_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v1, v0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: v_max_i32_e32 v0, s6, v1 -; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN2-NEXT: v_mov_b32_e32 v3, v2 +; GCN2-NEXT: v_max_i32_e32 v2, s6, v3 +; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN2-NEXT: s_cbranch_execnz .LBB91_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN2-NEXT: v_mov_b32_e32 v1, s2 -; GCN2-NEXT: v_mov_b32_e32 v2, s3 -; GCN2-NEXT: flat_store_dword v[1:2], v0 +; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_max_i32_ret_addr64: @@ -4249,27 +4195,25 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GCN3-NEXT: s_addc_u32 s1, s1, s5 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_dword v0, v[0:1] -; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: flat_load_dword v2, v[0:1] +; GCN3-NEXT: s_mov_b64 s[0:1], 0 ; GCN3-NEXT: .LBB91_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v1, v0 -; GCN3-NEXT: v_mov_b32_e32 v3, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 -; GCN3-NEXT: v_max_i32_e32 v0, s6, v1 -; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN3-NEXT: v_mov_b32_e32 v3, v2 +; GCN3-NEXT: v_max_i32_e32 v2, s6, v3 +; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN3-NEXT: s_cbranch_execnz .LBB91_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN3-NEXT: v_mov_b32_e32 v1, s2 -; GCN3-NEXT: v_mov_b32_e32 v2, s3 -; GCN3-NEXT: flat_store_dword v[1:2], v0 +; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i32 %index @@ -4715,20 +4659,18 @@ define amdgpu_gfx void @flat_atomic_umax_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: .LBB98_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_max_u32_e32 v0, s6, v1 -; GCN1-NEXT: v_mov_b32_e32 v3, s5 -; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN1-NEXT: v_max_u32_e32 v2, s6, v3 +; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_mov_b32_e32 v3, v2 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_cbranch_execnz .LBB98_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4740,20 +4682,18 @@ define amdgpu_gfx void @flat_atomic_umax_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: .LBB98_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_max_u32_e32 v0, s6, v1 -; GCN2-NEXT: v_mov_b32_e32 v3, s5 -; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN2-NEXT: v_max_u32_e32 v2, s6, v3 +; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_mov_b32_e32 v3, v2 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_cbranch_execnz .LBB98_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4765,20 +4705,18 @@ define amdgpu_gfx void @flat_atomic_umax_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: flat_load_dword v1, v[0:1] +; GCN3-NEXT: flat_load_dword v3, v[0:1] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: .LBB98_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_max_u32_e32 v0, s6, v1 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN3-NEXT: v_max_u32_e32 v2, s6, v3 +; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: v_mov_b32_e32 v3, v2 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB98_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4796,24 +4734,22 @@ define amdgpu_gfx void @flat_atomic_umax_i32_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: s_addc_u32 s35, s5, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s34 ; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: flat_load_dword v1, v[0:1] -; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: .LBB99_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: v_mov_b32_e32 v2, s34 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_max_u32_e32 v0, s6, v1 -; GCN1-NEXT: v_mov_b32_e32 v3, s35 -; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN1-NEXT: v_max_u32_e32 v2, s6, v3 +; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; GCN1-NEXT: v_mov_b32_e32 v1, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: v_mov_b32_e32 v3, v2 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_cbranch_execnz .LBB99_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umax_i32_noret_offset_scalar: @@ -4823,24 +4759,22 @@ define amdgpu_gfx void @flat_atomic_umax_i32_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_addc_u32 s35, s5, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s34 ; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: flat_load_dword v1, v[0:1] -; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: .LBB99_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: v_mov_b32_e32 v2, s34 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_max_u32_e32 v0, s6, v1 -; GCN2-NEXT: v_mov_b32_e32 v3, s35 -; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN2-NEXT: v_max_u32_e32 v2, s6, v3 +; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; GCN2-NEXT: v_mov_b32_e32 v1, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: v_mov_b32_e32 v3, v2 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_cbranch_execnz .LBB99_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umax_i32_noret_offset_scalar: @@ -4848,20 +4782,18 @@ define amdgpu_gfx void @flat_atomic_umax_i32_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16 +; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: .LBB99_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_max_u32_e32 v0, s6, v1 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc +; GCN3-NEXT: v_max_u32_e32 v2, s6, v3 +; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: v_mov_b32_e32 v3, v2 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB99_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4879,18 +4811,18 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: v_mov_b32_e32 v1, s4 ; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: v_mov_b32_e32 v2, s5 ; GCN1-NEXT: .LBB100_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v1, v0 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: v_mov_b32_e32 v3, s5 -; GCN1-NEXT: v_max_u32_e32 v0, s6, v1 -; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN1-NEXT: v_mov_b32_e32 v4, v0 +; GCN1-NEXT: v_max_u32_e32 v3, s6, v4 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_cbranch_execnz .LBB100_1 @@ -4904,18 +4836,18 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: v_mov_b32_e32 v1, s4 ; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: v_mov_b32_e32 v2, s5 ; GCN2-NEXT: .LBB100_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v1, v0 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 -; GCN2-NEXT: v_mov_b32_e32 v3, s5 -; GCN2-NEXT: v_max_u32_e32 v0, s6, v1 -; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN2-NEXT: v_mov_b32_e32 v4, v0 +; GCN2-NEXT: v_max_u32_e32 v3, s6, v4 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_cbranch_execnz .LBB100_1 @@ -4929,18 +4861,18 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] +; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB100_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v1, v0 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: v_max_u32_e32 v0, s6, v1 -; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN3-NEXT: v_mov_b32_e32 v4, v0 +; GCN3-NEXT: v_max_u32_e32 v3, s6, v4 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB100_1 @@ -4957,26 +4889,24 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s34, s4, 16 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v0, s34 -; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: flat_load_dword v0, v[0:1] -; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: v_mov_b32_e32 v1, s34 +; GCN1-NEXT: v_mov_b32_e32 v2, s35 +; GCN1-NEXT: flat_load_dword v0, v[1:2] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: .LBB101_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v1, v0 -; GCN1-NEXT: v_mov_b32_e32 v2, s34 -; GCN1-NEXT: v_mov_b32_e32 v3, s35 -; GCN1-NEXT: v_max_u32_e32 v0, s6, v1 -; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN1-NEXT: v_mov_b32_e32 v4, v0 +; GCN1-NEXT: v_max_u32_e32 v3, s6, v4 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_cbranch_execnz .LBB101_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umax_i32_ret_offset_scalar: @@ -4984,26 +4914,24 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s34, s4, 16 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v0, s34 -; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: flat_load_dword v0, v[0:1] -; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: v_mov_b32_e32 v1, s34 +; GCN2-NEXT: v_mov_b32_e32 v2, s35 +; GCN2-NEXT: flat_load_dword v0, v[1:2] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: .LBB101_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v1, v0 -; GCN2-NEXT: v_mov_b32_e32 v2, s34 -; GCN2-NEXT: v_mov_b32_e32 v3, s35 -; GCN2-NEXT: v_max_u32_e32 v0, s6, v1 -; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN2-NEXT: v_mov_b32_e32 v4, v0 +; GCN2-NEXT: v_max_u32_e32 v3, s6, v4 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_cbranch_execnz .LBB101_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umax_i32_ret_offset_scalar: @@ -5012,18 +4940,18 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 +; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB101_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v1, v0 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: v_max_u32_e32 v0, s6, v1 -; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc +; GCN3-NEXT: v_mov_b32_e32 v4, v0 +; GCN3-NEXT: v_max_u32_e32 v3, s6, v4 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB101_1 @@ -5049,21 +4977,19 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_dword v1, v[0:1] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: s_mov_b64 s[0:1], 0 ; GCN1-NEXT: .LBB102_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_max_u32_e32 v0, s2, v1 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN1-NEXT: v_max_u32_e32 v2, s2, v3 +; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: v_mov_b32_e32 v1, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN1-NEXT: v_mov_b32_e32 v3, v2 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN1-NEXT: s_cbranch_execnz .LBB102_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_endpgm @@ -5081,21 +5007,19 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: flat_load_dword v1, v[0:1] -; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: s_mov_b64 s[0:1], 0 ; GCN2-NEXT: .LBB102_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_max_u32_e32 v0, s2, v1 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN2-NEXT: v_max_u32_e32 v2, s2, v3 +; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: v_mov_b32_e32 v1, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN2-NEXT: v_mov_b32_e32 v3, v2 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN2-NEXT: s_cbranch_execnz .LBB102_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_endpgm @@ -5111,21 +5035,19 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i32 ; GCN3-NEXT: s_addc_u32 s1, s1, s5 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16 -; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[0:1], 0 ; GCN3-NEXT: .LBB102_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: v_mov_b32_e32 v3, s1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_max_u32_e32 v0, s2, v1 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 -; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc +; GCN3-NEXT: v_max_u32_e32 v2, s2, v3 +; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: v_mov_b32_e32 v1, v0 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN3-NEXT: v_mov_b32_e32 v3, v2 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN3-NEXT: s_cbranch_execnz .LBB102_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_endpgm @@ -5151,27 +5073,25 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_dword v0, v[0:1] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: flat_load_dword v2, v[0:1] +; GCN1-NEXT: s_mov_b64 s[0:1], 0 ; GCN1-NEXT: .LBB103_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v1, v0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: v_max_u32_e32 v0, s6, v1 -; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN1-NEXT: v_mov_b32_e32 v3, v2 +; GCN1-NEXT: v_max_u32_e32 v2, s6, v3 +; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN1-NEXT: s_cbranch_execnz .LBB103_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN1-NEXT: v_mov_b32_e32 v1, s2 -; GCN1-NEXT: v_mov_b32_e32 v2, s3 -; GCN1-NEXT: flat_store_dword v[1:2], v0 +; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: flat_store_dword v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umax_i32_ret_addr64_offset: @@ -5188,27 +5108,25 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: flat_load_dword v0, v[0:1] -; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: flat_load_dword v2, v[0:1] +; GCN2-NEXT: s_mov_b64 s[0:1], 0 ; GCN2-NEXT: .LBB103_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v1, v0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: v_max_u32_e32 v0, s6, v1 -; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN2-NEXT: v_mov_b32_e32 v3, v2 +; GCN2-NEXT: v_max_u32_e32 v2, s6, v3 +; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN2-NEXT: s_cbranch_execnz .LBB103_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN2-NEXT: v_mov_b32_e32 v1, s2 -; GCN2-NEXT: v_mov_b32_e32 v2, s3 -; GCN2-NEXT: flat_store_dword v[1:2], v0 +; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_umax_i32_ret_addr64_offset: @@ -5223,27 +5141,25 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2 ; GCN3-NEXT: s_addc_u32 s1, s1, s5 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 -; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: flat_load_dword v2, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[0:1], 0 ; GCN3-NEXT: .LBB103_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v1, v0 -; GCN3-NEXT: v_mov_b32_e32 v3, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 -; GCN3-NEXT: v_max_u32_e32 v0, s6, v1 -; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc +; GCN3-NEXT: v_mov_b32_e32 v3, v2 +; GCN3-NEXT: v_max_u32_e32 v2, s6, v3 +; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN3-NEXT: s_cbranch_execnz .LBB103_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN3-NEXT: v_mov_b32_e32 v1, s2 -; GCN3-NEXT: v_mov_b32_e32 v2, s3 -; GCN3-NEXT: flat_store_dword v[1:2], v0 +; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i32 %index @@ -5266,27 +5182,25 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 % ; GCN1-NEXT: s_addc_u32 s1, s1, s5 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_dword v0, v[0:1] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: flat_load_dword v2, v[0:1] +; GCN1-NEXT: s_mov_b64 s[0:1], 0 ; GCN1-NEXT: .LBB104_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v1, v0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: v_max_u32_e32 v0, s6, v1 -; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN1-NEXT: v_mov_b32_e32 v3, v2 +; GCN1-NEXT: v_max_u32_e32 v2, s6, v3 +; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN1-NEXT: s_cbranch_execnz .LBB104_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN1-NEXT: v_mov_b32_e32 v1, s2 -; GCN1-NEXT: v_mov_b32_e32 v2, s3 -; GCN1-NEXT: flat_store_dword v[1:2], v0 +; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: flat_store_dword v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umax_i32_ret_addr64: @@ -5301,27 +5215,25 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 % ; GCN2-NEXT: s_addc_u32 s1, s1, s5 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: flat_load_dword v0, v[0:1] -; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: flat_load_dword v2, v[0:1] +; GCN2-NEXT: s_mov_b64 s[0:1], 0 ; GCN2-NEXT: .LBB104_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v1, v0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: v_max_u32_e32 v0, s6, v1 -; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN2-NEXT: v_mov_b32_e32 v3, v2 +; GCN2-NEXT: v_max_u32_e32 v2, s6, v3 +; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN2-NEXT: s_cbranch_execnz .LBB104_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN2-NEXT: v_mov_b32_e32 v1, s2 -; GCN2-NEXT: v_mov_b32_e32 v2, s3 -; GCN2-NEXT: flat_store_dword v[1:2], v0 +; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_umax_i32_ret_addr64: @@ -5336,27 +5248,25 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 % ; GCN3-NEXT: s_addc_u32 s1, s1, s5 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_dword v0, v[0:1] -; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: flat_load_dword v2, v[0:1] +; GCN3-NEXT: s_mov_b64 s[0:1], 0 ; GCN3-NEXT: .LBB104_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v1, v0 -; GCN3-NEXT: v_mov_b32_e32 v3, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 -; GCN3-NEXT: v_max_u32_e32 v0, s6, v1 -; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN3-NEXT: v_mov_b32_e32 v3, v2 +; GCN3-NEXT: v_max_u32_e32 v2, s6, v3 +; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN3-NEXT: s_cbranch_execnz .LBB104_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN3-NEXT: v_mov_b32_e32 v1, s2 -; GCN3-NEXT: v_mov_b32_e32 v2, s3 -; GCN3-NEXT: flat_store_dword v[1:2], v0 +; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i32 %index @@ -5802,20 +5712,18 @@ define amdgpu_gfx void @flat_atomic_umin_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: .LBB111_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_min_u32_e32 v0, s6, v1 -; GCN1-NEXT: v_mov_b32_e32 v3, s5 -; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN1-NEXT: v_min_u32_e32 v2, s6, v3 +; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_mov_b32_e32 v3, v2 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_cbranch_execnz .LBB111_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5827,20 +5735,18 @@ define amdgpu_gfx void @flat_atomic_umin_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: .LBB111_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_min_u32_e32 v0, s6, v1 -; GCN2-NEXT: v_mov_b32_e32 v3, s5 -; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN2-NEXT: v_min_u32_e32 v2, s6, v3 +; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_mov_b32_e32 v3, v2 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_cbranch_execnz .LBB111_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5852,20 +5758,18 @@ define amdgpu_gfx void @flat_atomic_umin_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: flat_load_dword v1, v[0:1] +; GCN3-NEXT: flat_load_dword v3, v[0:1] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: .LBB111_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_min_u32_e32 v0, s6, v1 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN3-NEXT: v_min_u32_e32 v2, s6, v3 +; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: v_mov_b32_e32 v3, v2 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB111_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5883,24 +5787,22 @@ define amdgpu_gfx void @flat_atomic_umin_i32_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: s_addc_u32 s35, s5, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s34 ; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: flat_load_dword v1, v[0:1] -; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: .LBB112_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: v_mov_b32_e32 v2, s34 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_min_u32_e32 v0, s6, v1 -; GCN1-NEXT: v_mov_b32_e32 v3, s35 -; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN1-NEXT: v_min_u32_e32 v2, s6, v3 +; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; GCN1-NEXT: v_mov_b32_e32 v1, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: v_mov_b32_e32 v3, v2 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_cbranch_execnz .LBB112_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umin_i32_noret_offset_scalar: @@ -5910,24 +5812,22 @@ define amdgpu_gfx void @flat_atomic_umin_i32_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_addc_u32 s35, s5, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s34 ; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: flat_load_dword v1, v[0:1] -; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: .LBB112_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: v_mov_b32_e32 v2, s34 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_min_u32_e32 v0, s6, v1 -; GCN2-NEXT: v_mov_b32_e32 v3, s35 -; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN2-NEXT: v_min_u32_e32 v2, s6, v3 +; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; GCN2-NEXT: v_mov_b32_e32 v1, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: v_mov_b32_e32 v3, v2 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_cbranch_execnz .LBB112_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umin_i32_noret_offset_scalar: @@ -5935,20 +5835,18 @@ define amdgpu_gfx void @flat_atomic_umin_i32_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16 +; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: .LBB112_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_min_u32_e32 v0, s6, v1 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc +; GCN3-NEXT: v_min_u32_e32 v2, s6, v3 +; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: v_mov_b32_e32 v3, v2 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB112_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5966,18 +5864,18 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: v_mov_b32_e32 v1, s4 ; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: v_mov_b32_e32 v2, s5 ; GCN1-NEXT: .LBB113_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v1, v0 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: v_mov_b32_e32 v3, s5 -; GCN1-NEXT: v_min_u32_e32 v0, s6, v1 -; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN1-NEXT: v_mov_b32_e32 v4, v0 +; GCN1-NEXT: v_min_u32_e32 v3, s6, v4 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_cbranch_execnz .LBB113_1 @@ -5991,18 +5889,18 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: v_mov_b32_e32 v1, s4 ; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: v_mov_b32_e32 v2, s5 ; GCN2-NEXT: .LBB113_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v1, v0 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 -; GCN2-NEXT: v_mov_b32_e32 v3, s5 -; GCN2-NEXT: v_min_u32_e32 v0, s6, v1 -; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN2-NEXT: v_mov_b32_e32 v4, v0 +; GCN2-NEXT: v_min_u32_e32 v3, s6, v4 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_cbranch_execnz .LBB113_1 @@ -6016,18 +5914,18 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] +; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB113_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v1, v0 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: v_min_u32_e32 v0, s6, v1 -; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN3-NEXT: v_mov_b32_e32 v4, v0 +; GCN3-NEXT: v_min_u32_e32 v3, s6, v4 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB113_1 @@ -6044,26 +5942,24 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s34, s4, 16 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v0, s34 -; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: flat_load_dword v0, v[0:1] -; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: v_mov_b32_e32 v1, s34 +; GCN1-NEXT: v_mov_b32_e32 v2, s35 +; GCN1-NEXT: flat_load_dword v0, v[1:2] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: .LBB114_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v1, v0 -; GCN1-NEXT: v_mov_b32_e32 v2, s34 -; GCN1-NEXT: v_mov_b32_e32 v3, s35 -; GCN1-NEXT: v_min_u32_e32 v0, s6, v1 -; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN1-NEXT: v_mov_b32_e32 v4, v0 +; GCN1-NEXT: v_min_u32_e32 v3, s6, v4 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_cbranch_execnz .LBB114_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umin_i32_ret_offset_scalar: @@ -6071,26 +5967,24 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s34, s4, 16 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v0, s34 -; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: flat_load_dword v0, v[0:1] -; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: v_mov_b32_e32 v1, s34 +; GCN2-NEXT: v_mov_b32_e32 v2, s35 +; GCN2-NEXT: flat_load_dword v0, v[1:2] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: .LBB114_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v1, v0 -; GCN2-NEXT: v_mov_b32_e32 v2, s34 -; GCN2-NEXT: v_mov_b32_e32 v3, s35 -; GCN2-NEXT: v_min_u32_e32 v0, s6, v1 -; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN2-NEXT: v_mov_b32_e32 v4, v0 +; GCN2-NEXT: v_min_u32_e32 v3, s6, v4 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_cbranch_execnz .LBB114_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umin_i32_ret_offset_scalar: @@ -6099,18 +5993,18 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 +; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB114_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v1, v0 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: v_min_u32_e32 v0, s6, v1 -; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc +; GCN3-NEXT: v_mov_b32_e32 v4, v0 +; GCN3-NEXT: v_min_u32_e32 v3, s6, v4 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB114_1 @@ -6559,20 +6453,18 @@ define amdgpu_gfx void @flat_atomic_min_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: .LBB121_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_min_i32_e32 v0, s6, v1 -; GCN1-NEXT: v_mov_b32_e32 v3, s5 -; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN1-NEXT: v_min_i32_e32 v2, s6, v3 +; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_mov_b32_e32 v3, v2 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_cbranch_execnz .LBB121_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6584,20 +6476,18 @@ define amdgpu_gfx void @flat_atomic_min_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: .LBB121_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_min_i32_e32 v0, s6, v1 -; GCN2-NEXT: v_mov_b32_e32 v3, s5 -; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN2-NEXT: v_min_i32_e32 v2, s6, v3 +; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_mov_b32_e32 v3, v2 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_cbranch_execnz .LBB121_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6609,20 +6499,18 @@ define amdgpu_gfx void @flat_atomic_min_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: flat_load_dword v1, v[0:1] +; GCN3-NEXT: flat_load_dword v3, v[0:1] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: .LBB121_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_min_i32_e32 v0, s6, v1 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN3-NEXT: v_min_i32_e32 v2, s6, v3 +; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: v_mov_b32_e32 v3, v2 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB121_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6640,24 +6528,22 @@ define amdgpu_gfx void @flat_atomic_min_i32_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: s_addc_u32 s35, s5, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s34 ; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: flat_load_dword v1, v[0:1] -; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: .LBB122_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: v_mov_b32_e32 v2, s34 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_min_i32_e32 v0, s6, v1 -; GCN1-NEXT: v_mov_b32_e32 v3, s35 -; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN1-NEXT: v_min_i32_e32 v2, s6, v3 +; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; GCN1-NEXT: v_mov_b32_e32 v1, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: v_mov_b32_e32 v3, v2 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_cbranch_execnz .LBB122_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_min_i32_noret_offset_scalar: @@ -6667,24 +6553,22 @@ define amdgpu_gfx void @flat_atomic_min_i32_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_addc_u32 s35, s5, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s34 ; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: flat_load_dword v1, v[0:1] -; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: .LBB122_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: v_mov_b32_e32 v2, s34 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_min_i32_e32 v0, s6, v1 -; GCN2-NEXT: v_mov_b32_e32 v3, s35 -; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN2-NEXT: v_min_i32_e32 v2, s6, v3 +; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; GCN2-NEXT: v_mov_b32_e32 v1, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: v_mov_b32_e32 v3, v2 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_cbranch_execnz .LBB122_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_min_i32_noret_offset_scalar: @@ -6692,20 +6576,18 @@ define amdgpu_gfx void @flat_atomic_min_i32_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16 +; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: .LBB122_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_min_i32_e32 v0, s6, v1 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc +; GCN3-NEXT: v_min_i32_e32 v2, s6, v3 +; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: v_mov_b32_e32 v3, v2 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB122_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6723,18 +6605,18 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: v_mov_b32_e32 v1, s4 ; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: v_mov_b32_e32 v2, s5 ; GCN1-NEXT: .LBB123_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v1, v0 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: v_mov_b32_e32 v3, s5 -; GCN1-NEXT: v_min_i32_e32 v0, s6, v1 -; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN1-NEXT: v_mov_b32_e32 v4, v0 +; GCN1-NEXT: v_min_i32_e32 v3, s6, v4 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_cbranch_execnz .LBB123_1 @@ -6748,18 +6630,18 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: v_mov_b32_e32 v1, s4 ; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: v_mov_b32_e32 v2, s5 ; GCN2-NEXT: .LBB123_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v1, v0 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 -; GCN2-NEXT: v_mov_b32_e32 v3, s5 -; GCN2-NEXT: v_min_i32_e32 v0, s6, v1 -; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN2-NEXT: v_mov_b32_e32 v4, v0 +; GCN2-NEXT: v_min_i32_e32 v3, s6, v4 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_cbranch_execnz .LBB123_1 @@ -6773,18 +6655,18 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] +; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB123_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v1, v0 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: v_min_i32_e32 v0, s6, v1 -; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN3-NEXT: v_mov_b32_e32 v4, v0 +; GCN3-NEXT: v_min_i32_e32 v3, s6, v4 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB123_1 @@ -6801,26 +6683,24 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s34, s4, 16 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v0, s34 -; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: flat_load_dword v0, v[0:1] -; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: v_mov_b32_e32 v1, s34 +; GCN1-NEXT: v_mov_b32_e32 v2, s35 +; GCN1-NEXT: flat_load_dword v0, v[1:2] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: .LBB124_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v1, v0 -; GCN1-NEXT: v_mov_b32_e32 v2, s34 -; GCN1-NEXT: v_mov_b32_e32 v3, s35 -; GCN1-NEXT: v_min_i32_e32 v0, s6, v1 -; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN1-NEXT: v_mov_b32_e32 v4, v0 +; GCN1-NEXT: v_min_i32_e32 v3, s6, v4 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_cbranch_execnz .LBB124_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_min_i32_ret_offset_scalar: @@ -6828,26 +6708,24 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s34, s4, 16 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v0, s34 -; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: flat_load_dword v0, v[0:1] -; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: v_mov_b32_e32 v1, s34 +; GCN2-NEXT: v_mov_b32_e32 v2, s35 +; GCN2-NEXT: flat_load_dword v0, v[1:2] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: .LBB124_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v1, v0 -; GCN2-NEXT: v_mov_b32_e32 v2, s34 -; GCN2-NEXT: v_mov_b32_e32 v3, s35 -; GCN2-NEXT: v_min_i32_e32 v0, s6, v1 -; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN2-NEXT: v_mov_b32_e32 v4, v0 +; GCN2-NEXT: v_min_i32_e32 v3, s6, v4 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_cbranch_execnz .LBB124_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_min_i32_ret_offset_scalar: @@ -6856,18 +6734,18 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 +; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB124_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v1, v0 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: v_min_i32_e32 v0, s6, v1 -; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc +; GCN3-NEXT: v_mov_b32_e32 v4, v0 +; GCN3-NEXT: v_min_i32_e32 v3, s6, v4 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB124_1 @@ -6893,21 +6771,19 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i32 % ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_dword v1, v[0:1] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: s_mov_b64 s[0:1], 0 ; GCN1-NEXT: .LBB125_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_min_i32_e32 v0, s2, v1 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN1-NEXT: v_min_i32_e32 v2, s2, v3 +; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: v_mov_b32_e32 v1, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN1-NEXT: v_mov_b32_e32 v3, v2 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN1-NEXT: s_cbranch_execnz .LBB125_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_endpgm @@ -6925,21 +6801,19 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i32 % ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: flat_load_dword v1, v[0:1] -; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: s_mov_b64 s[0:1], 0 ; GCN2-NEXT: .LBB125_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_min_i32_e32 v0, s2, v1 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN2-NEXT: v_min_i32_e32 v2, s2, v3 +; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: v_mov_b32_e32 v1, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN2-NEXT: v_mov_b32_e32 v3, v2 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN2-NEXT: s_cbranch_execnz .LBB125_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_endpgm @@ -6955,21 +6829,19 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i32 % ; GCN3-NEXT: s_addc_u32 s1, s1, s5 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16 -; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[0:1], 0 ; GCN3-NEXT: .LBB125_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: v_mov_b32_e32 v3, s1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_min_i32_e32 v0, s2, v1 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 -; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc +; GCN3-NEXT: v_min_i32_e32 v2, s2, v3 +; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: v_mov_b32_e32 v1, v0 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN3-NEXT: v_mov_b32_e32 v3, v2 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN3-NEXT: s_cbranch_execnz .LBB125_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_endpgm @@ -6995,27 +6867,25 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_dword v0, v[0:1] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: flat_load_dword v2, v[0:1] +; GCN1-NEXT: s_mov_b64 s[0:1], 0 ; GCN1-NEXT: .LBB126_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v1, v0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: v_min_i32_e32 v0, s6, v1 -; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN1-NEXT: v_mov_b32_e32 v3, v2 +; GCN1-NEXT: v_min_i32_e32 v2, s6, v3 +; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN1-NEXT: s_cbranch_execnz .LBB126_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN1-NEXT: v_mov_b32_e32 v1, s2 -; GCN1-NEXT: v_mov_b32_e32 v2, s3 -; GCN1-NEXT: flat_store_dword v[1:2], v0 +; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: flat_store_dword v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_min_i32_ret_addr64_offset: @@ -7032,27 +6902,25 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: flat_load_dword v0, v[0:1] -; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: flat_load_dword v2, v[0:1] +; GCN2-NEXT: s_mov_b64 s[0:1], 0 ; GCN2-NEXT: .LBB126_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v1, v0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: v_min_i32_e32 v0, s6, v1 -; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN2-NEXT: v_mov_b32_e32 v3, v2 +; GCN2-NEXT: v_min_i32_e32 v2, s6, v3 +; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN2-NEXT: s_cbranch_execnz .LBB126_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN2-NEXT: v_mov_b32_e32 v1, s2 -; GCN2-NEXT: v_mov_b32_e32 v2, s3 -; GCN2-NEXT: flat_store_dword v[1:2], v0 +; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_min_i32_ret_addr64_offset: @@ -7067,27 +6935,25 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GCN3-NEXT: s_addc_u32 s1, s1, s5 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 -; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: flat_load_dword v2, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[0:1], 0 ; GCN3-NEXT: .LBB126_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v1, v0 -; GCN3-NEXT: v_mov_b32_e32 v3, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 -; GCN3-NEXT: v_min_i32_e32 v0, s6, v1 -; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc +; GCN3-NEXT: v_mov_b32_e32 v3, v2 +; GCN3-NEXT: v_min_i32_e32 v2, s6, v3 +; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN3-NEXT: s_cbranch_execnz .LBB126_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN3-NEXT: v_mov_b32_e32 v1, s2 -; GCN3-NEXT: v_mov_b32_e32 v2, s3 -; GCN3-NEXT: flat_store_dword v[1:2], v0 +; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i32 %index @@ -7100,78 +6966,72 @@ entry: define amdgpu_kernel void @atomic_min_i32(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_min_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; GCN1-NEXT: s_load_dword s4, s[4:5], 0xb -; GCN1-NEXT: s_mov_b64 s[2:3], 0 +; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s2, s[4:5], 0xb +; GCN1-NEXT: s_mov_b64 s[0:1], 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: v_mov_b32_e32 v0, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s7 +; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: .LBB127_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_min_i32_e32 v0, s4, v1 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN1-NEXT: v_min_i32_e32 v2, s2, v3 +; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GCN1-NEXT: v_mov_b32_e32 v1, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN1-NEXT: v_mov_b32_e32 v3, v2 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN1-NEXT: s_cbranch_execnz .LBB127_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_min_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GCN2-NEXT: s_load_dword s4, s[4:5], 0x2c -; GCN2-NEXT: s_mov_b64 s[2:3], 0 +; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s2, s[4:5], 0x2c +; GCN2-NEXT: s_mov_b64 s[0:1], 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: v_mov_b32_e32 v0, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s7 +; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: .LBB127_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_min_i32_e32 v0, s4, v1 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN2-NEXT: v_min_i32_e32 v2, s2, v3 +; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GCN2-NEXT: v_mov_b32_e32 v1, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN2-NEXT: v_mov_b32_e32 v3, v2 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN2-NEXT: s_cbranch_execnz .LBB127_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_min_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GCN3-NEXT: s_load_dword s6, s[4:5], 0x2c -; GCN3-NEXT: s_mov_b64 s[2:3], 0 +; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s2, s[4:5], 0x2c +; GCN3-NEXT: s_mov_b64 s[0:1], 0 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_dword v1, v[0:1] +; GCN3-NEXT: v_mov_b32_e32 v0, s6 +; GCN3-NEXT: v_mov_b32_e32 v1, s7 +; GCN3-NEXT: flat_load_dword v3, v[0:1] ; GCN3-NEXT: .LBB127_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: v_mov_b32_e32 v3, s1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_min_i32_e32 v0, s6, v1 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 -; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN3-NEXT: v_min_i32_e32 v2, s2, v3 +; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GCN3-NEXT: v_mov_b32_e32 v1, v0 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN3-NEXT: v_mov_b32_e32 v3, v2 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN3-NEXT: s_cbranch_execnz .LBB127_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_endpgm @@ -7193,27 +7053,25 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GCN1-NEXT: s_addc_u32 s1, s1, s5 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_dword v0, v[0:1] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: flat_load_dword v2, v[0:1] +; GCN1-NEXT: s_mov_b64 s[0:1], 0 ; GCN1-NEXT: .LBB128_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v1, v0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: v_min_i32_e32 v0, s6, v1 -; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN1-NEXT: v_mov_b32_e32 v3, v2 +; GCN1-NEXT: v_min_i32_e32 v2, s6, v3 +; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN1-NEXT: s_cbranch_execnz .LBB128_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN1-NEXT: v_mov_b32_e32 v1, s2 -; GCN1-NEXT: v_mov_b32_e32 v2, s3 -; GCN1-NEXT: flat_store_dword v[1:2], v0 +; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: flat_store_dword v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_min_i32_ret_addr64: @@ -7228,27 +7086,25 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GCN2-NEXT: s_addc_u32 s1, s1, s5 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: flat_load_dword v0, v[0:1] -; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: flat_load_dword v2, v[0:1] +; GCN2-NEXT: s_mov_b64 s[0:1], 0 ; GCN2-NEXT: .LBB128_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v1, v0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: v_min_i32_e32 v0, s6, v1 -; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN2-NEXT: v_mov_b32_e32 v3, v2 +; GCN2-NEXT: v_min_i32_e32 v2, s6, v3 +; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN2-NEXT: s_cbranch_execnz .LBB128_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN2-NEXT: v_mov_b32_e32 v1, s2 -; GCN2-NEXT: v_mov_b32_e32 v2, s3 -; GCN2-NEXT: flat_store_dword v[1:2], v0 +; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_min_i32_ret_addr64: @@ -7263,27 +7119,25 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GCN3-NEXT: s_addc_u32 s1, s1, s5 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_dword v0, v[0:1] -; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: flat_load_dword v2, v[0:1] +; GCN3-NEXT: s_mov_b64 s[0:1], 0 ; GCN3-NEXT: .LBB128_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v1, v0 -; GCN3-NEXT: v_mov_b32_e32 v3, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 -; GCN3-NEXT: v_min_i32_e32 v0, s6, v1 -; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN3-NEXT: v_mov_b32_e32 v3, v2 +; GCN3-NEXT: v_min_i32_e32 v2, s6, v3 +; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN3-NEXT: s_cbranch_execnz .LBB128_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN3-NEXT: v_mov_b32_e32 v1, s2 -; GCN3-NEXT: v_mov_b32_e32 v2, s3 -; GCN3-NEXT: flat_store_dword v[1:2], v0 +; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i32 %index diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll index 66ac4d2198ea5..36bddb7ac2fd6 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll @@ -6998,8 +6998,6 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_and_b32_e32 v0, s7, v3 ; GCN1-NEXT: v_and_b32_e32 v6, s6, v2 -; GCN1-NEXT: v_mov_b32_e32 v4, s4 -; GCN1-NEXT: v_mov_b32_e32 v5, s5 ; GCN1-NEXT: v_not_b32_e32 v1, v0 ; GCN1-NEXT: v_not_b32_e32 v0, v6 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -7065,8 +7063,6 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_and_b32_e32 v0, s7, v3 ; GCN2-NEXT: v_and_b32_e32 v6, s6, v2 -; GCN2-NEXT: v_mov_b32_e32 v4, s4 -; GCN2-NEXT: v_mov_b32_e32 v5, s5 ; GCN2-NEXT: v_not_b32_e32 v1, v0 ; GCN2-NEXT: v_not_b32_e32 v0, v6 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -7115,17 +7111,15 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN3-NEXT: .LBB54_2: ; %atomicrmw.phi ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB54_3: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: .LBB54_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_and_b32_e32 v0, s7, v3 ; GCN3-NEXT: v_and_b32_e32 v6, s6, v2 -; GCN3-NEXT: v_mov_b32_e32 v4, s4 -; GCN3-NEXT: v_mov_b32_e32 v5, s5 ; GCN3-NEXT: v_not_b32_e32 v1, v0 ; GCN3-NEXT: v_not_b32_e32 v0, v6 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -7194,8 +7188,6 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_and_b32_e32 v0, s7, v3 ; GCN1-NEXT: v_and_b32_e32 v6, s6, v2 -; GCN1-NEXT: v_mov_b32_e32 v4, s34 -; GCN1-NEXT: v_mov_b32_e32 v5, s35 ; GCN1-NEXT: v_not_b32_e32 v1, v0 ; GCN1-NEXT: v_not_b32_e32 v0, v6 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -7263,8 +7255,6 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_and_b32_e32 v0, s7, v3 ; GCN2-NEXT: v_and_b32_e32 v6, s6, v2 -; GCN2-NEXT: v_mov_b32_e32 v4, s34 -; GCN2-NEXT: v_mov_b32_e32 v5, s35 ; GCN2-NEXT: v_not_b32_e32 v1, v0 ; GCN2-NEXT: v_not_b32_e32 v0, v6 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -7315,17 +7305,15 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: .LBB55_2: ; %atomicrmw.phi ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB55_3: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v0, s34 -; GCN3-NEXT: v_mov_b32_e32 v1, s35 -; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN3-NEXT: v_mov_b32_e32 v4, s34 +; GCN3-NEXT: v_mov_b32_e32 v5, s35 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GCN3-NEXT: s_mov_b64 s[36:37], 0 ; GCN3-NEXT: .LBB55_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_and_b32_e32 v0, s7, v3 ; GCN3-NEXT: v_and_b32_e32 v6, s6, v2 -; GCN3-NEXT: v_mov_b32_e32 v4, s34 -; GCN3-NEXT: v_mov_b32_e32 v5, s35 ; GCN3-NEXT: v_not_b32_e32 v1, v0 ; GCN3-NEXT: v_not_b32_e32 v0, v6 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -7385,18 +7373,16 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: .LBB56_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v3, v1 -; GCN1-NEXT: v_mov_b32_e32 v2, v0 -; GCN1-NEXT: v_mov_b32_e32 v4, s4 -; GCN1-NEXT: v_and_b32_e32 v0, s7, v3 -; GCN1-NEXT: v_and_b32_e32 v6, s6, v2 -; GCN1-NEXT: v_mov_b32_e32 v5, s5 -; GCN1-NEXT: v_not_b32_e32 v1, v0 -; GCN1-NEXT: v_not_b32_e32 v0, v6 -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GCN1-NEXT: v_mov_b32_e32 v7, v1 +; GCN1-NEXT: v_mov_b32_e32 v6, v0 +; GCN1-NEXT: v_and_b32_e32 v0, s7, v7 +; GCN1-NEXT: v_and_b32_e32 v1, s6, v6 +; GCN1-NEXT: v_not_b32_e32 v5, v0 +; GCN1-NEXT: v_not_b32_e32 v4, v1 +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_cbranch_execnz .LBB56_2 @@ -7450,18 +7436,16 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: .LBB56_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v3, v1 -; GCN2-NEXT: v_mov_b32_e32 v2, v0 -; GCN2-NEXT: v_mov_b32_e32 v4, s4 -; GCN2-NEXT: v_and_b32_e32 v0, s7, v3 -; GCN2-NEXT: v_and_b32_e32 v6, s6, v2 -; GCN2-NEXT: v_mov_b32_e32 v5, s5 -; GCN2-NEXT: v_not_b32_e32 v1, v0 -; GCN2-NEXT: v_not_b32_e32 v0, v6 -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GCN2-NEXT: v_mov_b32_e32 v7, v1 +; GCN2-NEXT: v_mov_b32_e32 v6, v0 +; GCN2-NEXT: v_and_b32_e32 v0, s7, v7 +; GCN2-NEXT: v_and_b32_e32 v1, s6, v6 +; GCN2-NEXT: v_not_b32_e32 v5, v0 +; GCN2-NEXT: v_not_b32_e32 v4, v1 +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_cbranch_execnz .LBB56_2 @@ -7500,25 +7484,23 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] ; GCN3-NEXT: s_cbranch_vccz .LBB56_4 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: .LBB56_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v3, v1 -; GCN3-NEXT: v_mov_b32_e32 v2, v0 -; GCN3-NEXT: v_mov_b32_e32 v4, s4 -; GCN3-NEXT: v_and_b32_e32 v0, s7, v3 -; GCN3-NEXT: v_and_b32_e32 v6, s6, v2 -; GCN3-NEXT: v_mov_b32_e32 v5, s5 -; GCN3-NEXT: v_not_b32_e32 v1, v0 -; GCN3-NEXT: v_not_b32_e32 v0, v6 -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GCN3-NEXT: v_mov_b32_e32 v7, v1 +; GCN3-NEXT: v_mov_b32_e32 v6, v0 +; GCN3-NEXT: v_and_b32_e32 v0, s7, v7 +; GCN3-NEXT: v_and_b32_e32 v1, s6, v6 +; GCN3-NEXT: v_not_b32_e32 v5, v0 +; GCN3-NEXT: v_not_b32_e32 v4, v1 +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB56_2 @@ -7575,18 +7557,16 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN1-NEXT: .LBB57_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v3, v1 -; GCN1-NEXT: v_mov_b32_e32 v2, v0 -; GCN1-NEXT: v_mov_b32_e32 v4, s34 -; GCN1-NEXT: v_and_b32_e32 v0, s7, v3 -; GCN1-NEXT: v_and_b32_e32 v6, s6, v2 -; GCN1-NEXT: v_mov_b32_e32 v5, s35 -; GCN1-NEXT: v_not_b32_e32 v1, v0 -; GCN1-NEXT: v_not_b32_e32 v0, v6 -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GCN1-NEXT: v_mov_b32_e32 v7, v1 +; GCN1-NEXT: v_mov_b32_e32 v6, v0 +; GCN1-NEXT: v_and_b32_e32 v0, s7, v7 +; GCN1-NEXT: v_and_b32_e32 v1, s6, v6 +; GCN1-NEXT: v_not_b32_e32 v5, v0 +; GCN1-NEXT: v_not_b32_e32 v4, v1 +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_cbranch_execnz .LBB57_2 @@ -7642,18 +7622,16 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN2-NEXT: .LBB57_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v3, v1 -; GCN2-NEXT: v_mov_b32_e32 v2, v0 -; GCN2-NEXT: v_mov_b32_e32 v4, s34 -; GCN2-NEXT: v_and_b32_e32 v0, s7, v3 -; GCN2-NEXT: v_and_b32_e32 v6, s6, v2 -; GCN2-NEXT: v_mov_b32_e32 v5, s35 -; GCN2-NEXT: v_not_b32_e32 v1, v0 -; GCN2-NEXT: v_not_b32_e32 v0, v6 -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GCN2-NEXT: v_mov_b32_e32 v7, v1 +; GCN2-NEXT: v_mov_b32_e32 v6, v0 +; GCN2-NEXT: v_and_b32_e32 v0, s7, v7 +; GCN2-NEXT: v_and_b32_e32 v1, s6, v6 +; GCN2-NEXT: v_not_b32_e32 v5, v0 +; GCN2-NEXT: v_not_b32_e32 v4, v1 +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_cbranch_execnz .LBB57_2 @@ -7694,25 +7672,23 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] ; GCN3-NEXT: s_cbranch_vccz .LBB57_4 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v0, s34 -; GCN3-NEXT: v_mov_b32_e32 v1, s35 -; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GCN3-NEXT: v_mov_b32_e32 v2, s34 +; GCN3-NEXT: v_mov_b32_e32 v3, s35 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] ; GCN3-NEXT: s_mov_b64 s[36:37], 0 ; GCN3-NEXT: .LBB57_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v3, v1 -; GCN3-NEXT: v_mov_b32_e32 v2, v0 -; GCN3-NEXT: v_mov_b32_e32 v4, s34 -; GCN3-NEXT: v_and_b32_e32 v0, s7, v3 -; GCN3-NEXT: v_and_b32_e32 v6, s6, v2 -; GCN3-NEXT: v_mov_b32_e32 v5, s35 -; GCN3-NEXT: v_not_b32_e32 v1, v0 -; GCN3-NEXT: v_not_b32_e32 v0, v6 -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GCN3-NEXT: v_mov_b32_e32 v7, v1 +; GCN3-NEXT: v_mov_b32_e32 v6, v0 +; GCN3-NEXT: v_and_b32_e32 v0, s7, v7 +; GCN3-NEXT: v_and_b32_e32 v1, s6, v6 +; GCN3-NEXT: v_not_b32_e32 v5, v0 +; GCN3-NEXT: v_not_b32_e32 v4, v1 +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN3-NEXT: s_cbranch_execnz .LBB57_2 @@ -11446,16 +11422,14 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: flat_load_dword v2, v[4:5] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: v_mov_b32_e32 v6, s7 +; GCN1-NEXT: v_mov_b32_e32 v7, s6 ; GCN1-NEXT: .LBB84_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; GCN1-NEXT: v_mov_b32_e32 v0, s7 -; GCN1-NEXT: v_mov_b32_e32 v6, s6 -; GCN1-NEXT: v_mov_b32_e32 v4, s4 -; GCN1-NEXT: v_mov_b32_e32 v5, s5 -; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -11514,16 +11488,14 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: flat_load_dword v2, v[4:5] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: v_mov_b32_e32 v6, s7 +; GCN2-NEXT: v_mov_b32_e32 v7, s6 ; GCN2-NEXT: .LBB84_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; GCN2-NEXT: v_mov_b32_e32 v0, s7 -; GCN2-NEXT: v_mov_b32_e32 v6, s6 -; GCN2-NEXT: v_mov_b32_e32 v4, s4 -; GCN2-NEXT: v_mov_b32_e32 v5, s5 -; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -11570,20 +11542,18 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN3-NEXT: .LBB84_2: ; %atomicrmw.phi ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB84_3: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: v_mov_b32_e32 v6, s7 +; GCN3-NEXT: v_mov_b32_e32 v7, s6 ; GCN3-NEXT: .LBB84_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; GCN3-NEXT: v_mov_b32_e32 v0, s7 -; GCN3-NEXT: v_mov_b32_e32 v6, s6 -; GCN3-NEXT: v_mov_b32_e32 v4, s4 -; GCN3-NEXT: v_mov_b32_e32 v5, s5 -; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -11645,16 +11615,14 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: flat_load_dword v2, v[4:5] ; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: v_mov_b32_e32 v6, s7 +; GCN1-NEXT: v_mov_b32_e32 v7, s6 ; GCN1-NEXT: .LBB85_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; GCN1-NEXT: v_mov_b32_e32 v0, s7 -; GCN1-NEXT: v_mov_b32_e32 v6, s6 -; GCN1-NEXT: v_mov_b32_e32 v4, s34 -; GCN1-NEXT: v_mov_b32_e32 v5, s35 -; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -11715,16 +11683,14 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: flat_load_dword v2, v[4:5] ; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: v_mov_b32_e32 v6, s7 +; GCN2-NEXT: v_mov_b32_e32 v7, s6 ; GCN2-NEXT: .LBB85_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; GCN2-NEXT: v_mov_b32_e32 v0, s7 -; GCN2-NEXT: v_mov_b32_e32 v6, s6 -; GCN2-NEXT: v_mov_b32_e32 v4, s34 -; GCN2-NEXT: v_mov_b32_e32 v5, s35 -; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -11773,20 +11739,18 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: .LBB85_2: ; %atomicrmw.phi ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB85_3: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v0, s34 -; GCN3-NEXT: v_mov_b32_e32 v1, s35 -; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN3-NEXT: v_mov_b32_e32 v4, s34 +; GCN3-NEXT: v_mov_b32_e32 v5, s35 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GCN3-NEXT: s_mov_b64 s[36:37], 0 +; GCN3-NEXT: v_mov_b32_e32 v6, s7 +; GCN3-NEXT: v_mov_b32_e32 v7, s6 ; GCN3-NEXT: .LBB85_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; GCN3-NEXT: v_mov_b32_e32 v0, s7 -; GCN3-NEXT: v_mov_b32_e32 v6, s6 -; GCN3-NEXT: v_mov_b32_e32 v4, s34 -; GCN3-NEXT: v_mov_b32_e32 v5, s35 -; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -11841,22 +11805,20 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[2:3] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: v_mov_b32_e32 v5, s6 ; GCN1-NEXT: .LBB86_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v3, v1 -; GCN1-NEXT: v_mov_b32_e32 v2, v0 -; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; GCN1-NEXT: v_mov_b32_e32 v0, s7 -; GCN1-NEXT: v_mov_b32_e32 v6, s6 -; GCN1-NEXT: v_mov_b32_e32 v4, s4 -; GCN1-NEXT: v_mov_b32_e32 v5, s5 -; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GCN1-NEXT: v_mov_b32_e32 v9, v1 +; GCN1-NEXT: v_mov_b32_e32 v8, v0 +; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9] +; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_cbranch_execnz .LBB86_2 @@ -11907,22 +11869,20 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[2:3] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: v_mov_b32_e32 v5, s6 ; GCN2-NEXT: .LBB86_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v3, v1 -; GCN2-NEXT: v_mov_b32_e32 v2, v0 -; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; GCN2-NEXT: v_mov_b32_e32 v0, s7 -; GCN2-NEXT: v_mov_b32_e32 v6, s6 -; GCN2-NEXT: v_mov_b32_e32 v4, s4 -; GCN2-NEXT: v_mov_b32_e32 v5, s5 -; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GCN2-NEXT: v_mov_b32_e32 v9, v1 +; GCN2-NEXT: v_mov_b32_e32 v8, v0 +; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9] +; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_cbranch_execnz .LBB86_2 @@ -11961,26 +11921,24 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] ; GCN3-NEXT: s_cbranch_vccz .LBB86_4 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: v_mov_b32_e32 v4, s7 +; GCN3-NEXT: v_mov_b32_e32 v5, s6 ; GCN3-NEXT: .LBB86_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v3, v1 -; GCN3-NEXT: v_mov_b32_e32 v2, v0 -; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; GCN3-NEXT: v_mov_b32_e32 v0, s7 -; GCN3-NEXT: v_mov_b32_e32 v6, s6 -; GCN3-NEXT: v_mov_b32_e32 v4, s4 -; GCN3-NEXT: v_mov_b32_e32 v5, s5 -; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GCN3-NEXT: v_mov_b32_e32 v9, v1 +; GCN3-NEXT: v_mov_b32_e32 v8, v0 +; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9] +; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB86_2 @@ -12034,22 +11992,20 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[2:3] ; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: v_mov_b32_e32 v5, s6 ; GCN1-NEXT: .LBB87_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v3, v1 -; GCN1-NEXT: v_mov_b32_e32 v2, v0 -; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; GCN1-NEXT: v_mov_b32_e32 v0, s7 -; GCN1-NEXT: v_mov_b32_e32 v6, s6 -; GCN1-NEXT: v_mov_b32_e32 v4, s34 -; GCN1-NEXT: v_mov_b32_e32 v5, s35 -; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GCN1-NEXT: v_mov_b32_e32 v9, v1 +; GCN1-NEXT: v_mov_b32_e32 v8, v0 +; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9] +; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_cbranch_execnz .LBB87_2 @@ -12102,22 +12058,20 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[2:3] ; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: v_mov_b32_e32 v5, s6 ; GCN2-NEXT: .LBB87_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v3, v1 -; GCN2-NEXT: v_mov_b32_e32 v2, v0 -; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; GCN2-NEXT: v_mov_b32_e32 v0, s7 -; GCN2-NEXT: v_mov_b32_e32 v6, s6 -; GCN2-NEXT: v_mov_b32_e32 v4, s34 -; GCN2-NEXT: v_mov_b32_e32 v5, s35 -; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GCN2-NEXT: v_mov_b32_e32 v9, v1 +; GCN2-NEXT: v_mov_b32_e32 v8, v0 +; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9] +; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_cbranch_execnz .LBB87_2 @@ -12158,26 +12112,24 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] ; GCN3-NEXT: s_cbranch_vccz .LBB87_4 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v0, s34 -; GCN3-NEXT: v_mov_b32_e32 v1, s35 -; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GCN3-NEXT: v_mov_b32_e32 v2, s34 +; GCN3-NEXT: v_mov_b32_e32 v3, s35 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] ; GCN3-NEXT: s_mov_b64 s[36:37], 0 +; GCN3-NEXT: v_mov_b32_e32 v4, s7 +; GCN3-NEXT: v_mov_b32_e32 v5, s6 ; GCN3-NEXT: .LBB87_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v3, v1 -; GCN3-NEXT: v_mov_b32_e32 v2, v0 -; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; GCN3-NEXT: v_mov_b32_e32 v0, s7 -; GCN3-NEXT: v_mov_b32_e32 v6, s6 -; GCN3-NEXT: v_mov_b32_e32 v4, s34 -; GCN3-NEXT: v_mov_b32_e32 v5, s35 -; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GCN3-NEXT: v_mov_b32_e32 v9, v1 +; GCN3-NEXT: v_mov_b32_e32 v8, v0 +; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9] +; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN3-NEXT: s_cbranch_execnz .LBB87_2 @@ -12238,20 +12190,18 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN1-NEXT: .LBB88_2: ; %atomicrmw.phi ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB88_3: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: v_mov_b32_e32 v5, s1 +; GCN1-NEXT: v_mov_b32_e32 v4, s0 +; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: v_mov_b32_e32 v6, s3 +; GCN1-NEXT: v_mov_b32_e32 v7, s2 ; GCN1-NEXT: .LBB88_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] -; GCN1-NEXT: v_mov_b32_e32 v0, s3 -; GCN1-NEXT: v_mov_b32_e32 v6, s2 -; GCN1-NEXT: v_mov_b32_e32 v5, s1 -; GCN1-NEXT: v_mov_b32_e32 v4, s0 -; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -12311,20 +12261,18 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN2-NEXT: .LBB88_2: ; %atomicrmw.phi ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB88_3: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN2-NEXT: v_mov_b32_e32 v5, s1 +; GCN2-NEXT: v_mov_b32_e32 v4, s0 +; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: v_mov_b32_e32 v6, s3 +; GCN2-NEXT: v_mov_b32_e32 v7, s2 ; GCN2-NEXT: .LBB88_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] -; GCN2-NEXT: v_mov_b32_e32 v0, s3 -; GCN2-NEXT: v_mov_b32_e32 v6, s2 -; GCN2-NEXT: v_mov_b32_e32 v5, s1 -; GCN2-NEXT: v_mov_b32_e32 v4, s0 -; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -12383,20 +12331,18 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN3-NEXT: .LBB88_2: ; %atomicrmw.phi ; GCN3-NEXT: s_endpgm ; GCN3-NEXT: .LBB88_3: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN3-NEXT: v_mov_b32_e32 v5, s1 +; GCN3-NEXT: v_mov_b32_e32 v4, s0 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: v_mov_b32_e32 v6, s3 +; GCN3-NEXT: v_mov_b32_e32 v7, s2 ; GCN3-NEXT: .LBB88_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] -; GCN3-NEXT: v_mov_b32_e32 v0, s3 -; GCN3-NEXT: v_mov_b32_e32 v6, s2 -; GCN3-NEXT: v_mov_b32_e32 v5, s1 -; GCN3-NEXT: v_mov_b32_e32 v4, s0 -; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -12453,26 +12399,24 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN1-NEXT: s_cbranch_vccz .LBB89_4 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GCN1-NEXT: v_mov_b32_e32 v3, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[2:3] ; GCN1-NEXT: s_mov_b64 s[2:3], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s13 +; GCN1-NEXT: v_mov_b32_e32 v5, s12 ; GCN1-NEXT: .LBB89_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v3, v1 -; GCN1-NEXT: v_mov_b32_e32 v2, v0 -; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[2:3] -; GCN1-NEXT: v_mov_b32_e32 v0, s13 -; GCN1-NEXT: v_mov_b32_e32 v6, s12 -; GCN1-NEXT: v_mov_b32_e32 v5, s1 -; GCN1-NEXT: v_mov_b32_e32 v4, s0 -; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GCN1-NEXT: v_mov_b32_e32 v9, v1 +; GCN1-NEXT: v_mov_b32_e32 v8, v0 +; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[8:9] +; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GCN1-NEXT: s_cbranch_execnz .LBB89_2 @@ -12526,26 +12470,24 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN2-NEXT: s_cbranch_vccz .LBB89_4 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GCN2-NEXT: v_mov_b32_e32 v3, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[2:3] ; GCN2-NEXT: s_mov_b64 s[2:3], 0 +; GCN2-NEXT: v_mov_b32_e32 v4, s13 +; GCN2-NEXT: v_mov_b32_e32 v5, s12 ; GCN2-NEXT: .LBB89_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v3, v1 -; GCN2-NEXT: v_mov_b32_e32 v2, v0 -; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[2:3] -; GCN2-NEXT: v_mov_b32_e32 v0, s13 -; GCN2-NEXT: v_mov_b32_e32 v6, s12 -; GCN2-NEXT: v_mov_b32_e32 v5, s1 -; GCN2-NEXT: v_mov_b32_e32 v4, s0 -; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GCN2-NEXT: v_mov_b32_e32 v9, v1 +; GCN2-NEXT: v_mov_b32_e32 v8, v0 +; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[8:9] +; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GCN2-NEXT: s_cbranch_execnz .LBB89_2 @@ -12598,26 +12540,24 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN3-NEXT: s_cbranch_vccz .LBB89_4 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GCN3-NEXT: v_mov_b32_e32 v3, s1 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] ; GCN3-NEXT: s_mov_b64 s[2:3], 0 +; GCN3-NEXT: v_mov_b32_e32 v4, s13 +; GCN3-NEXT: v_mov_b32_e32 v5, s12 ; GCN3-NEXT: .LBB89_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v3, v1 -; GCN3-NEXT: v_mov_b32_e32 v2, v0 -; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[2:3] -; GCN3-NEXT: v_mov_b32_e32 v0, s13 -; GCN3-NEXT: v_mov_b32_e32 v6, s12 -; GCN3-NEXT: v_mov_b32_e32 v5, s1 -; GCN3-NEXT: v_mov_b32_e32 v4, s0 -; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GCN3-NEXT: v_mov_b32_e32 v9, v1 +; GCN3-NEXT: v_mov_b32_e32 v8, v0 +; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[8:9] +; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GCN3-NEXT: s_cbranch_execnz .LBB89_2 @@ -12681,20 +12621,18 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN1-NEXT: .LBB90_2: ; %atomicrmw.phi ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB90_3: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: v_mov_b32_e32 v5, s1 +; GCN1-NEXT: v_mov_b32_e32 v4, s0 +; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: v_mov_b32_e32 v6, s3 +; GCN1-NEXT: v_mov_b32_e32 v7, s2 ; GCN1-NEXT: .LBB90_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] -; GCN1-NEXT: v_mov_b32_e32 v0, s3 -; GCN1-NEXT: v_mov_b32_e32 v6, s2 -; GCN1-NEXT: v_mov_b32_e32 v5, s1 -; GCN1-NEXT: v_mov_b32_e32 v4, s0 -; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -12752,20 +12690,18 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN2-NEXT: .LBB90_2: ; %atomicrmw.phi ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB90_3: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN2-NEXT: v_mov_b32_e32 v5, s1 +; GCN2-NEXT: v_mov_b32_e32 v4, s0 +; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: v_mov_b32_e32 v6, s3 +; GCN2-NEXT: v_mov_b32_e32 v7, s2 ; GCN2-NEXT: .LBB90_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] -; GCN2-NEXT: v_mov_b32_e32 v0, s3 -; GCN2-NEXT: v_mov_b32_e32 v6, s2 -; GCN2-NEXT: v_mov_b32_e32 v5, s1 -; GCN2-NEXT: v_mov_b32_e32 v4, s0 -; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -12822,20 +12758,18 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN3-NEXT: .LBB90_2: ; %atomicrmw.phi ; GCN3-NEXT: s_endpgm ; GCN3-NEXT: .LBB90_3: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN3-NEXT: v_mov_b32_e32 v5, s1 +; GCN3-NEXT: v_mov_b32_e32 v4, s0 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: v_mov_b32_e32 v6, s3 +; GCN3-NEXT: v_mov_b32_e32 v7, s2 ; GCN3-NEXT: .LBB90_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] -; GCN3-NEXT: v_mov_b32_e32 v0, s3 -; GCN3-NEXT: v_mov_b32_e32 v6, s2 -; GCN3-NEXT: v_mov_b32_e32 v5, s1 -; GCN3-NEXT: v_mov_b32_e32 v4, s0 -; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -12889,26 +12823,24 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN1-NEXT: s_cbranch_vccz .LBB91_4 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GCN1-NEXT: v_mov_b32_e32 v3, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[2:3] ; GCN1-NEXT: s_mov_b64 s[2:3], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s13 +; GCN1-NEXT: v_mov_b32_e32 v5, s12 ; GCN1-NEXT: .LBB91_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v3, v1 -; GCN1-NEXT: v_mov_b32_e32 v2, v0 -; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[2:3] -; GCN1-NEXT: v_mov_b32_e32 v0, s13 -; GCN1-NEXT: v_mov_b32_e32 v6, s12 -; GCN1-NEXT: v_mov_b32_e32 v5, s1 -; GCN1-NEXT: v_mov_b32_e32 v4, s0 -; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GCN1-NEXT: v_mov_b32_e32 v9, v1 +; GCN1-NEXT: v_mov_b32_e32 v8, v0 +; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[8:9] +; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GCN1-NEXT: s_cbranch_execnz .LBB91_2 @@ -12960,26 +12892,24 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN2-NEXT: s_cbranch_vccz .LBB91_4 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GCN2-NEXT: v_mov_b32_e32 v3, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[2:3] ; GCN2-NEXT: s_mov_b64 s[2:3], 0 +; GCN2-NEXT: v_mov_b32_e32 v4, s13 +; GCN2-NEXT: v_mov_b32_e32 v5, s12 ; GCN2-NEXT: .LBB91_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v3, v1 -; GCN2-NEXT: v_mov_b32_e32 v2, v0 -; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[2:3] -; GCN2-NEXT: v_mov_b32_e32 v0, s13 -; GCN2-NEXT: v_mov_b32_e32 v6, s12 -; GCN2-NEXT: v_mov_b32_e32 v5, s1 -; GCN2-NEXT: v_mov_b32_e32 v4, s0 -; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GCN2-NEXT: v_mov_b32_e32 v9, v1 +; GCN2-NEXT: v_mov_b32_e32 v8, v0 +; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[8:9] +; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GCN2-NEXT: s_cbranch_execnz .LBB91_2 @@ -13030,26 +12960,24 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN3-NEXT: s_cbranch_vccz .LBB91_4 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GCN3-NEXT: v_mov_b32_e32 v3, s1 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] ; GCN3-NEXT: s_mov_b64 s[2:3], 0 +; GCN3-NEXT: v_mov_b32_e32 v4, s13 +; GCN3-NEXT: v_mov_b32_e32 v5, s12 ; GCN3-NEXT: .LBB91_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v3, v1 -; GCN3-NEXT: v_mov_b32_e32 v2, v0 -; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[2:3] -; GCN3-NEXT: v_mov_b32_e32 v0, s13 -; GCN3-NEXT: v_mov_b32_e32 v6, s12 -; GCN3-NEXT: v_mov_b32_e32 v5, s1 -; GCN3-NEXT: v_mov_b32_e32 v4, s0 -; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GCN3-NEXT: v_mov_b32_e32 v9, v1 +; GCN3-NEXT: v_mov_b32_e32 v8, v0 +; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[8:9] +; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GCN3-NEXT: s_cbranch_execnz .LBB91_2 @@ -14197,16 +14125,14 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: flat_load_dword v2, v[4:5] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: v_mov_b32_e32 v6, s7 +; GCN1-NEXT: v_mov_b32_e32 v7, s6 ; GCN1-NEXT: .LBB98_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] -; GCN1-NEXT: v_mov_b32_e32 v0, s7 -; GCN1-NEXT: v_mov_b32_e32 v6, s6 -; GCN1-NEXT: v_mov_b32_e32 v4, s4 -; GCN1-NEXT: v_mov_b32_e32 v5, s5 -; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -14265,16 +14191,14 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: flat_load_dword v2, v[4:5] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: v_mov_b32_e32 v6, s7 +; GCN2-NEXT: v_mov_b32_e32 v7, s6 ; GCN2-NEXT: .LBB98_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] -; GCN2-NEXT: v_mov_b32_e32 v0, s7 -; GCN2-NEXT: v_mov_b32_e32 v6, s6 -; GCN2-NEXT: v_mov_b32_e32 v4, s4 -; GCN2-NEXT: v_mov_b32_e32 v5, s5 -; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -14321,20 +14245,18 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN3-NEXT: .LBB98_2: ; %atomicrmw.phi ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB98_3: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: v_mov_b32_e32 v6, s7 +; GCN3-NEXT: v_mov_b32_e32 v7, s6 ; GCN3-NEXT: .LBB98_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] -; GCN3-NEXT: v_mov_b32_e32 v0, s7 -; GCN3-NEXT: v_mov_b32_e32 v6, s6 -; GCN3-NEXT: v_mov_b32_e32 v4, s4 -; GCN3-NEXT: v_mov_b32_e32 v5, s5 -; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -14396,16 +14318,14 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: flat_load_dword v2, v[4:5] ; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: v_mov_b32_e32 v6, s7 +; GCN1-NEXT: v_mov_b32_e32 v7, s6 ; GCN1-NEXT: .LBB99_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] -; GCN1-NEXT: v_mov_b32_e32 v0, s7 -; GCN1-NEXT: v_mov_b32_e32 v6, s6 -; GCN1-NEXT: v_mov_b32_e32 v4, s34 -; GCN1-NEXT: v_mov_b32_e32 v5, s35 -; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -14466,16 +14386,14 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: flat_load_dword v2, v[4:5] ; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: v_mov_b32_e32 v6, s7 +; GCN2-NEXT: v_mov_b32_e32 v7, s6 ; GCN2-NEXT: .LBB99_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] -; GCN2-NEXT: v_mov_b32_e32 v0, s7 -; GCN2-NEXT: v_mov_b32_e32 v6, s6 -; GCN2-NEXT: v_mov_b32_e32 v4, s34 -; GCN2-NEXT: v_mov_b32_e32 v5, s35 -; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -14524,20 +14442,18 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: .LBB99_2: ; %atomicrmw.phi ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB99_3: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v0, s34 -; GCN3-NEXT: v_mov_b32_e32 v1, s35 -; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN3-NEXT: v_mov_b32_e32 v4, s34 +; GCN3-NEXT: v_mov_b32_e32 v5, s35 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GCN3-NEXT: s_mov_b64 s[36:37], 0 +; GCN3-NEXT: v_mov_b32_e32 v6, s7 +; GCN3-NEXT: v_mov_b32_e32 v7, s6 ; GCN3-NEXT: .LBB99_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] -; GCN3-NEXT: v_mov_b32_e32 v0, s7 -; GCN3-NEXT: v_mov_b32_e32 v6, s6 -; GCN3-NEXT: v_mov_b32_e32 v4, s34 -; GCN3-NEXT: v_mov_b32_e32 v5, s35 -; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -14592,22 +14508,20 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[2:3] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: v_mov_b32_e32 v5, s6 ; GCN1-NEXT: .LBB100_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v3, v1 -; GCN1-NEXT: v_mov_b32_e32 v2, v0 -; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] -; GCN1-NEXT: v_mov_b32_e32 v0, s7 -; GCN1-NEXT: v_mov_b32_e32 v6, s6 -; GCN1-NEXT: v_mov_b32_e32 v4, s4 -; GCN1-NEXT: v_mov_b32_e32 v5, s5 -; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GCN1-NEXT: v_mov_b32_e32 v9, v1 +; GCN1-NEXT: v_mov_b32_e32 v8, v0 +; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9] +; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_cbranch_execnz .LBB100_2 @@ -14658,22 +14572,20 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[2:3] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: v_mov_b32_e32 v5, s6 ; GCN2-NEXT: .LBB100_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v3, v1 -; GCN2-NEXT: v_mov_b32_e32 v2, v0 -; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] -; GCN2-NEXT: v_mov_b32_e32 v0, s7 -; GCN2-NEXT: v_mov_b32_e32 v6, s6 -; GCN2-NEXT: v_mov_b32_e32 v4, s4 -; GCN2-NEXT: v_mov_b32_e32 v5, s5 -; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GCN2-NEXT: v_mov_b32_e32 v9, v1 +; GCN2-NEXT: v_mov_b32_e32 v8, v0 +; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9] +; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_cbranch_execnz .LBB100_2 @@ -14712,26 +14624,24 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] ; GCN3-NEXT: s_cbranch_vccz .LBB100_4 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: v_mov_b32_e32 v4, s7 +; GCN3-NEXT: v_mov_b32_e32 v5, s6 ; GCN3-NEXT: .LBB100_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v3, v1 -; GCN3-NEXT: v_mov_b32_e32 v2, v0 -; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] -; GCN3-NEXT: v_mov_b32_e32 v0, s7 -; GCN3-NEXT: v_mov_b32_e32 v6, s6 -; GCN3-NEXT: v_mov_b32_e32 v4, s4 -; GCN3-NEXT: v_mov_b32_e32 v5, s5 -; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GCN3-NEXT: v_mov_b32_e32 v9, v1 +; GCN3-NEXT: v_mov_b32_e32 v8, v0 +; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9] +; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB100_2 @@ -14785,22 +14695,20 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[2:3] ; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: v_mov_b32_e32 v5, s6 ; GCN1-NEXT: .LBB101_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v3, v1 -; GCN1-NEXT: v_mov_b32_e32 v2, v0 -; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] -; GCN1-NEXT: v_mov_b32_e32 v0, s7 -; GCN1-NEXT: v_mov_b32_e32 v6, s6 -; GCN1-NEXT: v_mov_b32_e32 v4, s34 -; GCN1-NEXT: v_mov_b32_e32 v5, s35 -; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GCN1-NEXT: v_mov_b32_e32 v9, v1 +; GCN1-NEXT: v_mov_b32_e32 v8, v0 +; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9] +; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_cbranch_execnz .LBB101_2 @@ -14853,22 +14761,20 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[2:3] ; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: v_mov_b32_e32 v5, s6 ; GCN2-NEXT: .LBB101_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v3, v1 -; GCN2-NEXT: v_mov_b32_e32 v2, v0 -; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] -; GCN2-NEXT: v_mov_b32_e32 v0, s7 -; GCN2-NEXT: v_mov_b32_e32 v6, s6 -; GCN2-NEXT: v_mov_b32_e32 v4, s34 -; GCN2-NEXT: v_mov_b32_e32 v5, s35 -; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GCN2-NEXT: v_mov_b32_e32 v9, v1 +; GCN2-NEXT: v_mov_b32_e32 v8, v0 +; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9] +; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_cbranch_execnz .LBB101_2 @@ -14909,26 +14815,24 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] ; GCN3-NEXT: s_cbranch_vccz .LBB101_4 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v0, s34 -; GCN3-NEXT: v_mov_b32_e32 v1, s35 -; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GCN3-NEXT: v_mov_b32_e32 v2, s34 +; GCN3-NEXT: v_mov_b32_e32 v3, s35 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] ; GCN3-NEXT: s_mov_b64 s[36:37], 0 +; GCN3-NEXT: v_mov_b32_e32 v4, s7 +; GCN3-NEXT: v_mov_b32_e32 v5, s6 ; GCN3-NEXT: .LBB101_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v3, v1 -; GCN3-NEXT: v_mov_b32_e32 v2, v0 -; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] -; GCN3-NEXT: v_mov_b32_e32 v0, s7 -; GCN3-NEXT: v_mov_b32_e32 v6, s6 -; GCN3-NEXT: v_mov_b32_e32 v4, s34 -; GCN3-NEXT: v_mov_b32_e32 v5, s35 -; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GCN3-NEXT: v_mov_b32_e32 v9, v1 +; GCN3-NEXT: v_mov_b32_e32 v8, v0 +; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9] +; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN3-NEXT: s_cbranch_execnz .LBB101_2 @@ -14989,20 +14893,18 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; GCN1-NEXT: .LBB102_2: ; %atomicrmw.phi ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB102_3: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: v_mov_b32_e32 v5, s1 +; GCN1-NEXT: v_mov_b32_e32 v4, s0 +; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: v_mov_b32_e32 v6, s3 +; GCN1-NEXT: v_mov_b32_e32 v7, s2 ; GCN1-NEXT: .LBB102_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] -; GCN1-NEXT: v_mov_b32_e32 v0, s3 -; GCN1-NEXT: v_mov_b32_e32 v6, s2 -; GCN1-NEXT: v_mov_b32_e32 v5, s1 -; GCN1-NEXT: v_mov_b32_e32 v4, s0 -; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -15062,20 +14964,18 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; GCN2-NEXT: .LBB102_2: ; %atomicrmw.phi ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB102_3: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN2-NEXT: v_mov_b32_e32 v5, s1 +; GCN2-NEXT: v_mov_b32_e32 v4, s0 +; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: v_mov_b32_e32 v6, s3 +; GCN2-NEXT: v_mov_b32_e32 v7, s2 ; GCN2-NEXT: .LBB102_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] -; GCN2-NEXT: v_mov_b32_e32 v0, s3 -; GCN2-NEXT: v_mov_b32_e32 v6, s2 -; GCN2-NEXT: v_mov_b32_e32 v5, s1 -; GCN2-NEXT: v_mov_b32_e32 v4, s0 -; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -15134,20 +15034,18 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; GCN3-NEXT: .LBB102_2: ; %atomicrmw.phi ; GCN3-NEXT: s_endpgm ; GCN3-NEXT: .LBB102_3: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN3-NEXT: v_mov_b32_e32 v5, s1 +; GCN3-NEXT: v_mov_b32_e32 v4, s0 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: v_mov_b32_e32 v6, s3 +; GCN3-NEXT: v_mov_b32_e32 v7, s2 ; GCN3-NEXT: .LBB102_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] -; GCN3-NEXT: v_mov_b32_e32 v0, s3 -; GCN3-NEXT: v_mov_b32_e32 v6, s2 -; GCN3-NEXT: v_mov_b32_e32 v5, s1 -; GCN3-NEXT: v_mov_b32_e32 v4, s0 -; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -15204,26 +15102,24 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN1-NEXT: s_cbranch_vccz .LBB103_4 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GCN1-NEXT: v_mov_b32_e32 v3, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[2:3] ; GCN1-NEXT: s_mov_b64 s[2:3], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s13 +; GCN1-NEXT: v_mov_b32_e32 v5, s12 ; GCN1-NEXT: .LBB103_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v3, v1 -; GCN1-NEXT: v_mov_b32_e32 v2, v0 -; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[2:3] -; GCN1-NEXT: v_mov_b32_e32 v0, s13 -; GCN1-NEXT: v_mov_b32_e32 v6, s12 -; GCN1-NEXT: v_mov_b32_e32 v5, s1 -; GCN1-NEXT: v_mov_b32_e32 v4, s0 -; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GCN1-NEXT: v_mov_b32_e32 v9, v1 +; GCN1-NEXT: v_mov_b32_e32 v8, v0 +; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[8:9] +; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GCN1-NEXT: s_cbranch_execnz .LBB103_2 @@ -15277,26 +15173,24 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN2-NEXT: s_cbranch_vccz .LBB103_4 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GCN2-NEXT: v_mov_b32_e32 v3, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[2:3] ; GCN2-NEXT: s_mov_b64 s[2:3], 0 +; GCN2-NEXT: v_mov_b32_e32 v4, s13 +; GCN2-NEXT: v_mov_b32_e32 v5, s12 ; GCN2-NEXT: .LBB103_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v3, v1 -; GCN2-NEXT: v_mov_b32_e32 v2, v0 -; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[2:3] -; GCN2-NEXT: v_mov_b32_e32 v0, s13 -; GCN2-NEXT: v_mov_b32_e32 v6, s12 -; GCN2-NEXT: v_mov_b32_e32 v5, s1 -; GCN2-NEXT: v_mov_b32_e32 v4, s0 -; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GCN2-NEXT: v_mov_b32_e32 v9, v1 +; GCN2-NEXT: v_mov_b32_e32 v8, v0 +; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[8:9] +; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GCN2-NEXT: s_cbranch_execnz .LBB103_2 @@ -15349,26 +15243,24 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN3-NEXT: s_cbranch_vccz .LBB103_4 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GCN3-NEXT: v_mov_b32_e32 v3, s1 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] ; GCN3-NEXT: s_mov_b64 s[2:3], 0 +; GCN3-NEXT: v_mov_b32_e32 v4, s13 +; GCN3-NEXT: v_mov_b32_e32 v5, s12 ; GCN3-NEXT: .LBB103_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v3, v1 -; GCN3-NEXT: v_mov_b32_e32 v2, v0 -; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[2:3] -; GCN3-NEXT: v_mov_b32_e32 v0, s13 -; GCN3-NEXT: v_mov_b32_e32 v6, s12 -; GCN3-NEXT: v_mov_b32_e32 v5, s1 -; GCN3-NEXT: v_mov_b32_e32 v4, s0 -; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GCN3-NEXT: v_mov_b32_e32 v9, v1 +; GCN3-NEXT: v_mov_b32_e32 v8, v0 +; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[8:9] +; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GCN3-NEXT: s_cbranch_execnz .LBB103_2 @@ -15425,26 +15317,24 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN1-NEXT: s_cbranch_vccz .LBB104_4 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GCN1-NEXT: v_mov_b32_e32 v3, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[2:3] ; GCN1-NEXT: s_mov_b64 s[2:3], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s13 +; GCN1-NEXT: v_mov_b32_e32 v5, s12 ; GCN1-NEXT: .LBB104_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v3, v1 -; GCN1-NEXT: v_mov_b32_e32 v2, v0 -; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[2:3] -; GCN1-NEXT: v_mov_b32_e32 v0, s13 -; GCN1-NEXT: v_mov_b32_e32 v6, s12 -; GCN1-NEXT: v_mov_b32_e32 v5, s1 -; GCN1-NEXT: v_mov_b32_e32 v4, s0 -; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GCN1-NEXT: v_mov_b32_e32 v9, v1 +; GCN1-NEXT: v_mov_b32_e32 v8, v0 +; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[8:9] +; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GCN1-NEXT: s_cbranch_execnz .LBB104_2 @@ -15496,26 +15386,24 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN2-NEXT: s_cbranch_vccz .LBB104_4 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GCN2-NEXT: v_mov_b32_e32 v3, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[2:3] ; GCN2-NEXT: s_mov_b64 s[2:3], 0 +; GCN2-NEXT: v_mov_b32_e32 v4, s13 +; GCN2-NEXT: v_mov_b32_e32 v5, s12 ; GCN2-NEXT: .LBB104_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v3, v1 -; GCN2-NEXT: v_mov_b32_e32 v2, v0 -; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[2:3] -; GCN2-NEXT: v_mov_b32_e32 v0, s13 -; GCN2-NEXT: v_mov_b32_e32 v6, s12 -; GCN2-NEXT: v_mov_b32_e32 v5, s1 -; GCN2-NEXT: v_mov_b32_e32 v4, s0 -; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GCN2-NEXT: v_mov_b32_e32 v9, v1 +; GCN2-NEXT: v_mov_b32_e32 v8, v0 +; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[8:9] +; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GCN2-NEXT: s_cbranch_execnz .LBB104_2 @@ -15566,26 +15454,24 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN3-NEXT: s_cbranch_vccz .LBB104_4 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GCN3-NEXT: v_mov_b32_e32 v3, s1 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] ; GCN3-NEXT: s_mov_b64 s[2:3], 0 +; GCN3-NEXT: v_mov_b32_e32 v4, s13 +; GCN3-NEXT: v_mov_b32_e32 v5, s12 ; GCN3-NEXT: .LBB104_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v3, v1 -; GCN3-NEXT: v_mov_b32_e32 v2, v0 -; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[2:3] -; GCN3-NEXT: v_mov_b32_e32 v0, s13 -; GCN3-NEXT: v_mov_b32_e32 v6, s12 -; GCN3-NEXT: v_mov_b32_e32 v5, s1 -; GCN3-NEXT: v_mov_b32_e32 v4, s0 -; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GCN3-NEXT: v_mov_b32_e32 v9, v1 +; GCN3-NEXT: v_mov_b32_e32 v8, v0 +; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[8:9] +; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GCN3-NEXT: s_cbranch_execnz .LBB104_2 @@ -16733,16 +16619,14 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: flat_load_dword v2, v[4:5] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: v_mov_b32_e32 v6, s7 +; GCN1-NEXT: v_mov_b32_e32 v7, s6 ; GCN1-NEXT: .LBB111_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] -; GCN1-NEXT: v_mov_b32_e32 v0, s7 -; GCN1-NEXT: v_mov_b32_e32 v6, s6 -; GCN1-NEXT: v_mov_b32_e32 v4, s4 -; GCN1-NEXT: v_mov_b32_e32 v5, s5 -; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -16801,16 +16685,14 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: flat_load_dword v2, v[4:5] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: v_mov_b32_e32 v6, s7 +; GCN2-NEXT: v_mov_b32_e32 v7, s6 ; GCN2-NEXT: .LBB111_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] -; GCN2-NEXT: v_mov_b32_e32 v0, s7 -; GCN2-NEXT: v_mov_b32_e32 v6, s6 -; GCN2-NEXT: v_mov_b32_e32 v4, s4 -; GCN2-NEXT: v_mov_b32_e32 v5, s5 -; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -16857,20 +16739,18 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN3-NEXT: .LBB111_2: ; %atomicrmw.phi ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB111_3: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: v_mov_b32_e32 v6, s7 +; GCN3-NEXT: v_mov_b32_e32 v7, s6 ; GCN3-NEXT: .LBB111_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] -; GCN3-NEXT: v_mov_b32_e32 v0, s7 -; GCN3-NEXT: v_mov_b32_e32 v6, s6 -; GCN3-NEXT: v_mov_b32_e32 v4, s4 -; GCN3-NEXT: v_mov_b32_e32 v5, s5 -; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -16932,16 +16812,14 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: flat_load_dword v2, v[4:5] ; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: v_mov_b32_e32 v6, s7 +; GCN1-NEXT: v_mov_b32_e32 v7, s6 ; GCN1-NEXT: .LBB112_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] -; GCN1-NEXT: v_mov_b32_e32 v0, s7 -; GCN1-NEXT: v_mov_b32_e32 v6, s6 -; GCN1-NEXT: v_mov_b32_e32 v4, s34 -; GCN1-NEXT: v_mov_b32_e32 v5, s35 -; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -17002,16 +16880,14 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: flat_load_dword v2, v[4:5] ; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: v_mov_b32_e32 v6, s7 +; GCN2-NEXT: v_mov_b32_e32 v7, s6 ; GCN2-NEXT: .LBB112_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] -; GCN2-NEXT: v_mov_b32_e32 v0, s7 -; GCN2-NEXT: v_mov_b32_e32 v6, s6 -; GCN2-NEXT: v_mov_b32_e32 v4, s34 -; GCN2-NEXT: v_mov_b32_e32 v5, s35 -; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -17060,20 +16936,18 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: .LBB112_2: ; %atomicrmw.phi ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB112_3: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v0, s34 -; GCN3-NEXT: v_mov_b32_e32 v1, s35 -; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN3-NEXT: v_mov_b32_e32 v4, s34 +; GCN3-NEXT: v_mov_b32_e32 v5, s35 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GCN3-NEXT: s_mov_b64 s[36:37], 0 +; GCN3-NEXT: v_mov_b32_e32 v6, s7 +; GCN3-NEXT: v_mov_b32_e32 v7, s6 ; GCN3-NEXT: .LBB112_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] -; GCN3-NEXT: v_mov_b32_e32 v0, s7 -; GCN3-NEXT: v_mov_b32_e32 v6, s6 -; GCN3-NEXT: v_mov_b32_e32 v4, s34 -; GCN3-NEXT: v_mov_b32_e32 v5, s35 -; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -17128,22 +17002,20 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[2:3] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: v_mov_b32_e32 v5, s6 ; GCN1-NEXT: .LBB113_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v3, v1 -; GCN1-NEXT: v_mov_b32_e32 v2, v0 -; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] -; GCN1-NEXT: v_mov_b32_e32 v0, s7 -; GCN1-NEXT: v_mov_b32_e32 v6, s6 -; GCN1-NEXT: v_mov_b32_e32 v4, s4 -; GCN1-NEXT: v_mov_b32_e32 v5, s5 -; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GCN1-NEXT: v_mov_b32_e32 v9, v1 +; GCN1-NEXT: v_mov_b32_e32 v8, v0 +; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9] +; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_cbranch_execnz .LBB113_2 @@ -17194,22 +17066,20 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[2:3] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: v_mov_b32_e32 v5, s6 ; GCN2-NEXT: .LBB113_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v3, v1 -; GCN2-NEXT: v_mov_b32_e32 v2, v0 -; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] -; GCN2-NEXT: v_mov_b32_e32 v0, s7 -; GCN2-NEXT: v_mov_b32_e32 v6, s6 -; GCN2-NEXT: v_mov_b32_e32 v4, s4 -; GCN2-NEXT: v_mov_b32_e32 v5, s5 -; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GCN2-NEXT: v_mov_b32_e32 v9, v1 +; GCN2-NEXT: v_mov_b32_e32 v8, v0 +; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9] +; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_cbranch_execnz .LBB113_2 @@ -17248,26 +17118,24 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] ; GCN3-NEXT: s_cbranch_vccz .LBB113_4 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: v_mov_b32_e32 v4, s7 +; GCN3-NEXT: v_mov_b32_e32 v5, s6 ; GCN3-NEXT: .LBB113_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v3, v1 -; GCN3-NEXT: v_mov_b32_e32 v2, v0 -; GCN3-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] -; GCN3-NEXT: v_mov_b32_e32 v0, s7 -; GCN3-NEXT: v_mov_b32_e32 v6, s6 -; GCN3-NEXT: v_mov_b32_e32 v4, s4 -; GCN3-NEXT: v_mov_b32_e32 v5, s5 -; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GCN3-NEXT: v_mov_b32_e32 v9, v1 +; GCN3-NEXT: v_mov_b32_e32 v8, v0 +; GCN3-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9] +; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB113_2 @@ -17321,22 +17189,20 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[2:3] ; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: v_mov_b32_e32 v5, s6 ; GCN1-NEXT: .LBB114_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v3, v1 -; GCN1-NEXT: v_mov_b32_e32 v2, v0 -; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] -; GCN1-NEXT: v_mov_b32_e32 v0, s7 -; GCN1-NEXT: v_mov_b32_e32 v6, s6 -; GCN1-NEXT: v_mov_b32_e32 v4, s34 -; GCN1-NEXT: v_mov_b32_e32 v5, s35 -; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GCN1-NEXT: v_mov_b32_e32 v9, v1 +; GCN1-NEXT: v_mov_b32_e32 v8, v0 +; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9] +; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_cbranch_execnz .LBB114_2 @@ -17389,22 +17255,20 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[2:3] ; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: v_mov_b32_e32 v5, s6 ; GCN2-NEXT: .LBB114_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v3, v1 -; GCN2-NEXT: v_mov_b32_e32 v2, v0 -; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] -; GCN2-NEXT: v_mov_b32_e32 v0, s7 -; GCN2-NEXT: v_mov_b32_e32 v6, s6 -; GCN2-NEXT: v_mov_b32_e32 v4, s34 -; GCN2-NEXT: v_mov_b32_e32 v5, s35 -; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GCN2-NEXT: v_mov_b32_e32 v9, v1 +; GCN2-NEXT: v_mov_b32_e32 v8, v0 +; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9] +; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_cbranch_execnz .LBB114_2 @@ -17445,26 +17309,24 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] ; GCN3-NEXT: s_cbranch_vccz .LBB114_4 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v0, s34 -; GCN3-NEXT: v_mov_b32_e32 v1, s35 -; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GCN3-NEXT: v_mov_b32_e32 v2, s34 +; GCN3-NEXT: v_mov_b32_e32 v3, s35 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] ; GCN3-NEXT: s_mov_b64 s[36:37], 0 +; GCN3-NEXT: v_mov_b32_e32 v4, s7 +; GCN3-NEXT: v_mov_b32_e32 v5, s6 ; GCN3-NEXT: .LBB114_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v3, v1 -; GCN3-NEXT: v_mov_b32_e32 v2, v0 -; GCN3-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] -; GCN3-NEXT: v_mov_b32_e32 v0, s7 -; GCN3-NEXT: v_mov_b32_e32 v6, s6 -; GCN3-NEXT: v_mov_b32_e32 v4, s34 -; GCN3-NEXT: v_mov_b32_e32 v5, s35 -; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GCN3-NEXT: v_mov_b32_e32 v9, v1 +; GCN3-NEXT: v_mov_b32_e32 v8, v0 +; GCN3-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9] +; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN3-NEXT: s_cbranch_execnz .LBB114_2 @@ -18608,16 +18470,14 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: flat_load_dword v2, v[4:5] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: v_mov_b32_e32 v6, s7 +; GCN1-NEXT: v_mov_b32_e32 v7, s6 ; GCN1-NEXT: .LBB121_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] -; GCN1-NEXT: v_mov_b32_e32 v0, s7 -; GCN1-NEXT: v_mov_b32_e32 v6, s6 -; GCN1-NEXT: v_mov_b32_e32 v4, s4 -; GCN1-NEXT: v_mov_b32_e32 v5, s5 -; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -18676,16 +18536,14 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: flat_load_dword v2, v[4:5] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: v_mov_b32_e32 v6, s7 +; GCN2-NEXT: v_mov_b32_e32 v7, s6 ; GCN2-NEXT: .LBB121_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] -; GCN2-NEXT: v_mov_b32_e32 v0, s7 -; GCN2-NEXT: v_mov_b32_e32 v6, s6 -; GCN2-NEXT: v_mov_b32_e32 v4, s4 -; GCN2-NEXT: v_mov_b32_e32 v5, s5 -; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -18732,20 +18590,18 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN3-NEXT: .LBB121_2: ; %atomicrmw.phi ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB121_3: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: v_mov_b32_e32 v6, s7 +; GCN3-NEXT: v_mov_b32_e32 v7, s6 ; GCN3-NEXT: .LBB121_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] -; GCN3-NEXT: v_mov_b32_e32 v0, s7 -; GCN3-NEXT: v_mov_b32_e32 v6, s6 -; GCN3-NEXT: v_mov_b32_e32 v4, s4 -; GCN3-NEXT: v_mov_b32_e32 v5, s5 -; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -18807,16 +18663,14 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: flat_load_dword v2, v[4:5] ; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: v_mov_b32_e32 v6, s7 +; GCN1-NEXT: v_mov_b32_e32 v7, s6 ; GCN1-NEXT: .LBB122_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] -; GCN1-NEXT: v_mov_b32_e32 v0, s7 -; GCN1-NEXT: v_mov_b32_e32 v6, s6 -; GCN1-NEXT: v_mov_b32_e32 v4, s34 -; GCN1-NEXT: v_mov_b32_e32 v5, s35 -; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -18877,16 +18731,14 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: flat_load_dword v2, v[4:5] ; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: v_mov_b32_e32 v6, s7 +; GCN2-NEXT: v_mov_b32_e32 v7, s6 ; GCN2-NEXT: .LBB122_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] -; GCN2-NEXT: v_mov_b32_e32 v0, s7 -; GCN2-NEXT: v_mov_b32_e32 v6, s6 -; GCN2-NEXT: v_mov_b32_e32 v4, s34 -; GCN2-NEXT: v_mov_b32_e32 v5, s35 -; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -18935,20 +18787,18 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: .LBB122_2: ; %atomicrmw.phi ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB122_3: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v0, s34 -; GCN3-NEXT: v_mov_b32_e32 v1, s35 -; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN3-NEXT: v_mov_b32_e32 v4, s34 +; GCN3-NEXT: v_mov_b32_e32 v5, s35 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GCN3-NEXT: s_mov_b64 s[36:37], 0 +; GCN3-NEXT: v_mov_b32_e32 v6, s7 +; GCN3-NEXT: v_mov_b32_e32 v7, s6 ; GCN3-NEXT: .LBB122_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] -; GCN3-NEXT: v_mov_b32_e32 v0, s7 -; GCN3-NEXT: v_mov_b32_e32 v6, s6 -; GCN3-NEXT: v_mov_b32_e32 v4, s34 -; GCN3-NEXT: v_mov_b32_e32 v5, s35 -; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -19003,22 +18853,20 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[2:3] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: v_mov_b32_e32 v5, s6 ; GCN1-NEXT: .LBB123_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v3, v1 -; GCN1-NEXT: v_mov_b32_e32 v2, v0 -; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] -; GCN1-NEXT: v_mov_b32_e32 v0, s7 -; GCN1-NEXT: v_mov_b32_e32 v6, s6 -; GCN1-NEXT: v_mov_b32_e32 v4, s4 -; GCN1-NEXT: v_mov_b32_e32 v5, s5 -; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GCN1-NEXT: v_mov_b32_e32 v9, v1 +; GCN1-NEXT: v_mov_b32_e32 v8, v0 +; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9] +; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_cbranch_execnz .LBB123_2 @@ -19069,22 +18917,20 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[2:3] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: v_mov_b32_e32 v5, s6 ; GCN2-NEXT: .LBB123_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v3, v1 -; GCN2-NEXT: v_mov_b32_e32 v2, v0 -; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] -; GCN2-NEXT: v_mov_b32_e32 v0, s7 -; GCN2-NEXT: v_mov_b32_e32 v6, s6 -; GCN2-NEXT: v_mov_b32_e32 v4, s4 -; GCN2-NEXT: v_mov_b32_e32 v5, s5 -; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GCN2-NEXT: v_mov_b32_e32 v9, v1 +; GCN2-NEXT: v_mov_b32_e32 v8, v0 +; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9] +; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_cbranch_execnz .LBB123_2 @@ -19123,26 +18969,24 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] ; GCN3-NEXT: s_cbranch_vccz .LBB123_4 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: v_mov_b32_e32 v4, s7 +; GCN3-NEXT: v_mov_b32_e32 v5, s6 ; GCN3-NEXT: .LBB123_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v3, v1 -; GCN3-NEXT: v_mov_b32_e32 v2, v0 -; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] -; GCN3-NEXT: v_mov_b32_e32 v0, s7 -; GCN3-NEXT: v_mov_b32_e32 v6, s6 -; GCN3-NEXT: v_mov_b32_e32 v4, s4 -; GCN3-NEXT: v_mov_b32_e32 v5, s5 -; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GCN3-NEXT: v_mov_b32_e32 v9, v1 +; GCN3-NEXT: v_mov_b32_e32 v8, v0 +; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9] +; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB123_2 @@ -19196,22 +19040,20 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[2:3] ; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: v_mov_b32_e32 v5, s6 ; GCN1-NEXT: .LBB124_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v3, v1 -; GCN1-NEXT: v_mov_b32_e32 v2, v0 -; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] -; GCN1-NEXT: v_mov_b32_e32 v0, s7 -; GCN1-NEXT: v_mov_b32_e32 v6, s6 -; GCN1-NEXT: v_mov_b32_e32 v4, s34 -; GCN1-NEXT: v_mov_b32_e32 v5, s35 -; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GCN1-NEXT: v_mov_b32_e32 v9, v1 +; GCN1-NEXT: v_mov_b32_e32 v8, v0 +; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9] +; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_cbranch_execnz .LBB124_2 @@ -19264,22 +19106,20 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[2:3] ; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: v_mov_b32_e32 v5, s6 ; GCN2-NEXT: .LBB124_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v3, v1 -; GCN2-NEXT: v_mov_b32_e32 v2, v0 -; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] -; GCN2-NEXT: v_mov_b32_e32 v0, s7 -; GCN2-NEXT: v_mov_b32_e32 v6, s6 -; GCN2-NEXT: v_mov_b32_e32 v4, s34 -; GCN2-NEXT: v_mov_b32_e32 v5, s35 -; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GCN2-NEXT: v_mov_b32_e32 v9, v1 +; GCN2-NEXT: v_mov_b32_e32 v8, v0 +; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9] +; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_cbranch_execnz .LBB124_2 @@ -19320,26 +19160,24 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] ; GCN3-NEXT: s_cbranch_vccz .LBB124_4 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v0, s34 -; GCN3-NEXT: v_mov_b32_e32 v1, s35 -; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GCN3-NEXT: v_mov_b32_e32 v2, s34 +; GCN3-NEXT: v_mov_b32_e32 v3, s35 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] ; GCN3-NEXT: s_mov_b64 s[36:37], 0 +; GCN3-NEXT: v_mov_b32_e32 v4, s7 +; GCN3-NEXT: v_mov_b32_e32 v5, s6 ; GCN3-NEXT: .LBB124_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v3, v1 -; GCN3-NEXT: v_mov_b32_e32 v2, v0 -; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] -; GCN3-NEXT: v_mov_b32_e32 v0, s7 -; GCN3-NEXT: v_mov_b32_e32 v6, s6 -; GCN3-NEXT: v_mov_b32_e32 v4, s34 -; GCN3-NEXT: v_mov_b32_e32 v5, s35 -; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GCN3-NEXT: v_mov_b32_e32 v9, v1 +; GCN3-NEXT: v_mov_b32_e32 v8, v0 +; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9] +; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN3-NEXT: s_cbranch_execnz .LBB124_2 @@ -19400,20 +19238,18 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN1-NEXT: .LBB125_2: ; %atomicrmw.phi ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB125_3: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: v_mov_b32_e32 v5, s1 +; GCN1-NEXT: v_mov_b32_e32 v4, s0 +; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: v_mov_b32_e32 v6, s3 +; GCN1-NEXT: v_mov_b32_e32 v7, s2 ; GCN1-NEXT: .LBB125_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] -; GCN1-NEXT: v_mov_b32_e32 v0, s3 -; GCN1-NEXT: v_mov_b32_e32 v6, s2 -; GCN1-NEXT: v_mov_b32_e32 v5, s1 -; GCN1-NEXT: v_mov_b32_e32 v4, s0 -; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -19473,20 +19309,18 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN2-NEXT: .LBB125_2: ; %atomicrmw.phi ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB125_3: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN2-NEXT: v_mov_b32_e32 v5, s1 +; GCN2-NEXT: v_mov_b32_e32 v4, s0 +; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: v_mov_b32_e32 v6, s3 +; GCN2-NEXT: v_mov_b32_e32 v7, s2 ; GCN2-NEXT: .LBB125_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] -; GCN2-NEXT: v_mov_b32_e32 v0, s3 -; GCN2-NEXT: v_mov_b32_e32 v6, s2 -; GCN2-NEXT: v_mov_b32_e32 v5, s1 -; GCN2-NEXT: v_mov_b32_e32 v4, s0 -; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -19545,20 +19379,18 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN3-NEXT: .LBB125_2: ; %atomicrmw.phi ; GCN3-NEXT: s_endpgm ; GCN3-NEXT: .LBB125_3: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN3-NEXT: v_mov_b32_e32 v5, s1 +; GCN3-NEXT: v_mov_b32_e32 v4, s0 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: v_mov_b32_e32 v6, s3 +; GCN3-NEXT: v_mov_b32_e32 v7, s2 ; GCN3-NEXT: .LBB125_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] -; GCN3-NEXT: v_mov_b32_e32 v0, s3 -; GCN3-NEXT: v_mov_b32_e32 v6, s2 -; GCN3-NEXT: v_mov_b32_e32 v5, s1 -; GCN3-NEXT: v_mov_b32_e32 v4, s0 -; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -19615,26 +19447,24 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN1-NEXT: s_cbranch_vccz .LBB126_4 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GCN1-NEXT: v_mov_b32_e32 v3, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[2:3] ; GCN1-NEXT: s_mov_b64 s[2:3], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s13 +; GCN1-NEXT: v_mov_b32_e32 v5, s12 ; GCN1-NEXT: .LBB126_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v3, v1 -; GCN1-NEXT: v_mov_b32_e32 v2, v0 -; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[2:3] -; GCN1-NEXT: v_mov_b32_e32 v0, s13 -; GCN1-NEXT: v_mov_b32_e32 v6, s12 -; GCN1-NEXT: v_mov_b32_e32 v5, s1 -; GCN1-NEXT: v_mov_b32_e32 v4, s0 -; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GCN1-NEXT: v_mov_b32_e32 v9, v1 +; GCN1-NEXT: v_mov_b32_e32 v8, v0 +; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[8:9] +; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GCN1-NEXT: s_cbranch_execnz .LBB126_2 @@ -19688,26 +19518,24 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN2-NEXT: s_cbranch_vccz .LBB126_4 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GCN2-NEXT: v_mov_b32_e32 v3, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[2:3] ; GCN2-NEXT: s_mov_b64 s[2:3], 0 +; GCN2-NEXT: v_mov_b32_e32 v4, s13 +; GCN2-NEXT: v_mov_b32_e32 v5, s12 ; GCN2-NEXT: .LBB126_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v3, v1 -; GCN2-NEXT: v_mov_b32_e32 v2, v0 -; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[2:3] -; GCN2-NEXT: v_mov_b32_e32 v0, s13 -; GCN2-NEXT: v_mov_b32_e32 v6, s12 -; GCN2-NEXT: v_mov_b32_e32 v5, s1 -; GCN2-NEXT: v_mov_b32_e32 v4, s0 -; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GCN2-NEXT: v_mov_b32_e32 v9, v1 +; GCN2-NEXT: v_mov_b32_e32 v8, v0 +; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[8:9] +; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GCN2-NEXT: s_cbranch_execnz .LBB126_2 @@ -19760,26 +19588,24 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN3-NEXT: s_cbranch_vccz .LBB126_4 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GCN3-NEXT: v_mov_b32_e32 v3, s1 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] ; GCN3-NEXT: s_mov_b64 s[2:3], 0 +; GCN3-NEXT: v_mov_b32_e32 v4, s13 +; GCN3-NEXT: v_mov_b32_e32 v5, s12 ; GCN3-NEXT: .LBB126_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v3, v1 -; GCN3-NEXT: v_mov_b32_e32 v2, v0 -; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[2:3] -; GCN3-NEXT: v_mov_b32_e32 v0, s13 -; GCN3-NEXT: v_mov_b32_e32 v6, s12 -; GCN3-NEXT: v_mov_b32_e32 v5, s1 -; GCN3-NEXT: v_mov_b32_e32 v4, s0 -; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GCN3-NEXT: v_mov_b32_e32 v9, v1 +; GCN3-NEXT: v_mov_b32_e32 v8, v0 +; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[8:9] +; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GCN3-NEXT: s_cbranch_execnz .LBB126_2 @@ -19839,20 +19665,18 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GCN1-NEXT: .LBB127_2: ; %atomicrmw.phi ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB127_3: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: v_mov_b32_e32 v5, s1 +; GCN1-NEXT: v_mov_b32_e32 v4, s0 +; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: v_mov_b32_e32 v6, s3 +; GCN1-NEXT: v_mov_b32_e32 v7, s2 ; GCN1-NEXT: .LBB127_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] -; GCN1-NEXT: v_mov_b32_e32 v0, s3 -; GCN1-NEXT: v_mov_b32_e32 v6, s2 -; GCN1-NEXT: v_mov_b32_e32 v5, s1 -; GCN1-NEXT: v_mov_b32_e32 v4, s0 -; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -19906,20 +19730,18 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GCN2-NEXT: .LBB127_2: ; %atomicrmw.phi ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB127_3: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN2-NEXT: v_mov_b32_e32 v5, s1 +; GCN2-NEXT: v_mov_b32_e32 v4, s0 +; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: v_mov_b32_e32 v6, s3 +; GCN2-NEXT: v_mov_b32_e32 v7, s2 ; GCN2-NEXT: .LBB127_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] -; GCN2-NEXT: v_mov_b32_e32 v0, s3 -; GCN2-NEXT: v_mov_b32_e32 v6, s2 -; GCN2-NEXT: v_mov_b32_e32 v5, s1 -; GCN2-NEXT: v_mov_b32_e32 v4, s0 -; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -19972,20 +19794,18 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GCN3-NEXT: .LBB127_2: ; %atomicrmw.phi ; GCN3-NEXT: s_endpgm ; GCN3-NEXT: .LBB127_3: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN3-NEXT: v_mov_b32_e32 v5, s1 +; GCN3-NEXT: v_mov_b32_e32 v4, s0 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: v_mov_b32_e32 v6, s3 +; GCN3-NEXT: v_mov_b32_e32 v7, s2 ; GCN3-NEXT: .LBB127_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] -; GCN3-NEXT: v_mov_b32_e32 v0, s3 -; GCN3-NEXT: v_mov_b32_e32 v6, s2 -; GCN3-NEXT: v_mov_b32_e32 v5, s1 -; GCN3-NEXT: v_mov_b32_e32 v4, s0 -; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -20038,26 +19858,24 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN1-NEXT: s_cbranch_vccz .LBB128_4 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GCN1-NEXT: v_mov_b32_e32 v3, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[2:3] ; GCN1-NEXT: s_mov_b64 s[2:3], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s13 +; GCN1-NEXT: v_mov_b32_e32 v5, s12 ; GCN1-NEXT: .LBB128_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v3, v1 -; GCN1-NEXT: v_mov_b32_e32 v2, v0 -; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[2:3] -; GCN1-NEXT: v_mov_b32_e32 v0, s13 -; GCN1-NEXT: v_mov_b32_e32 v6, s12 -; GCN1-NEXT: v_mov_b32_e32 v5, s1 -; GCN1-NEXT: v_mov_b32_e32 v4, s0 -; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GCN1-NEXT: v_mov_b32_e32 v9, v1 +; GCN1-NEXT: v_mov_b32_e32 v8, v0 +; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[8:9] +; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GCN1-NEXT: s_cbranch_execnz .LBB128_2 @@ -20109,26 +19927,24 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN2-NEXT: s_cbranch_vccz .LBB128_4 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GCN2-NEXT: v_mov_b32_e32 v3, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[2:3] ; GCN2-NEXT: s_mov_b64 s[2:3], 0 +; GCN2-NEXT: v_mov_b32_e32 v4, s13 +; GCN2-NEXT: v_mov_b32_e32 v5, s12 ; GCN2-NEXT: .LBB128_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v3, v1 -; GCN2-NEXT: v_mov_b32_e32 v2, v0 -; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[2:3] -; GCN2-NEXT: v_mov_b32_e32 v0, s13 -; GCN2-NEXT: v_mov_b32_e32 v6, s12 -; GCN2-NEXT: v_mov_b32_e32 v5, s1 -; GCN2-NEXT: v_mov_b32_e32 v4, s0 -; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GCN2-NEXT: v_mov_b32_e32 v9, v1 +; GCN2-NEXT: v_mov_b32_e32 v8, v0 +; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[8:9] +; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GCN2-NEXT: s_cbranch_execnz .LBB128_2 @@ -20179,26 +19995,24 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN3-NEXT: s_cbranch_vccz .LBB128_4 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GCN3-NEXT: v_mov_b32_e32 v3, s1 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] ; GCN3-NEXT: s_mov_b64 s[2:3], 0 +; GCN3-NEXT: v_mov_b32_e32 v4, s13 +; GCN3-NEXT: v_mov_b32_e32 v5, s12 ; GCN3-NEXT: .LBB128_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v3, v1 -; GCN3-NEXT: v_mov_b32_e32 v2, v0 -; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[2:3] -; GCN3-NEXT: v_mov_b32_e32 v0, s13 -; GCN3-NEXT: v_mov_b32_e32 v6, s12 -; GCN3-NEXT: v_mov_b32_e32 v5, s1 -; GCN3-NEXT: v_mov_b32_e32 v4, s0 -; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GCN3-NEXT: v_mov_b32_e32 v9, v1 +; GCN3-NEXT: v_mov_b32_e32 v8, v0 +; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[8:9] +; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GCN3-NEXT: s_cbranch_execnz .LBB128_2 diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll index 4a4fcfe5afcc8..fe47461ebf956 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll @@ -2187,14 +2187,14 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 in ; GFX7-NEXT: v_mov_b32_e32 v4, s35 ; GFX7-NEXT: flat_load_dword v2, v[0:1] ; GFX7-NEXT: flat_load_dword v3, v[3:4] +; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v0, s7, v3 ; GFX7-NEXT: v_and_b32_e32 v6, s6, v2 -; GFX7-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-NEXT: v_not_b32_e32 v1, v0 ; GFX7-NEXT: v_not_b32_e32 v0, v6 ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -2221,14 +2221,14 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 in ; GFX8-NEXT: v_mov_b32_e32 v4, s35 ; GFX8-NEXT: flat_load_dword v2, v[0:1] ; GFX8-NEXT: flat_load_dword v3, v[3:4] +; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v0, s7, v3 ; GFX8-NEXT: v_and_b32_e32 v6, s6, v2 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: v_not_b32_e32 v1, v0 ; GFX8-NEXT: v_not_b32_e32 v0, v6 ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -2250,14 +2250,14 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 in ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v0, s7, v3 ; GFX9-NEXT: v_and_b32_e32 v6, s6, v2 -; GFX9-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: v_not_b32_e32 v1, v0 ; GFX9-NEXT: v_not_b32_e32 v0, v6 ; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -2290,14 +2290,12 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out, ; GFX7-NEXT: v_mov_b32_e32 v5, s35 ; GFX7-NEXT: flat_load_dword v3, v[0:1] ; GFX7-NEXT: flat_load_dword v2, v[4:5] -; GFX7-NEXT: s_mov_b64 s[36:37], 0 +; GFX7-NEXT: s_mov_b64 s[34:35], 0 ; GFX7-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v0, s7, v3 ; GFX7-NEXT: v_and_b32_e32 v6, s6, v2 -; GFX7-NEXT: v_mov_b32_e32 v4, s34 -; GFX7-NEXT: v_mov_b32_e32 v5, s35 ; GFX7-NEXT: v_not_b32_e32 v1, v0 ; GFX7-NEXT: v_not_b32_e32 v0, v6 ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -2305,12 +2303,12 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out, ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_cbranch_execnz .LBB55_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[36:37] +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_nand_i64_noret_offset_scalar: @@ -2326,14 +2324,12 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out, ; GFX8-NEXT: v_mov_b32_e32 v5, s35 ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: flat_load_dword v2, v[4:5] -; GFX8-NEXT: s_mov_b64 s[36:37], 0 +; GFX8-NEXT: s_mov_b64 s[34:35], 0 ; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v0, s7, v3 ; GFX8-NEXT: v_and_b32_e32 v6, s6, v2 -; GFX8-NEXT: v_mov_b32_e32 v4, s34 -; GFX8-NEXT: v_mov_b32_e32 v5, s35 ; GFX8-NEXT: v_not_b32_e32 v1, v0 ; GFX8-NEXT: v_not_b32_e32 v0, v6 ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -2341,12 +2337,12 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out, ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_cbranch_execnz .LBB55_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[36:37] +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_nand_i64_noret_offset_scalar: @@ -2355,14 +2351,14 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out, ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v0, s7, v3 ; GFX9-NEXT: v_and_b32_e32 v6, s6, v2 -; GFX9-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: v_not_b32_e32 v1, v0 ; GFX9-NEXT: v_not_b32_e32 v0, v6 ; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc @@ -2394,22 +2390,22 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX7-NEXT: v_mov_b32_e32 v3, s35 ; GFX7-NEXT: flat_load_dword v0, v[0:1] ; GFX7-NEXT: flat_load_dword v1, v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-NEXT: v_and_b32_e32 v0, s7, v3 -; GFX7-NEXT: v_and_b32_e32 v6, s6, v2 -; GFX7-NEXT: v_mov_b32_e32 v5, s5 -; GFX7-NEXT: v_not_b32_e32 v1, v0 -; GFX7-NEXT: v_not_b32_e32 v0, v6 -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: v_and_b32_e32 v0, s7, v7 +; GFX7-NEXT: v_and_b32_e32 v1, s6, v6 +; GFX7-NEXT: v_not_b32_e32 v5, v0 +; GFX7-NEXT: v_not_b32_e32 v4, v1 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_cbranch_execnz .LBB56_1 @@ -2428,22 +2424,22 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX8-NEXT: v_mov_b32_e32 v3, s35 ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: flat_load_dword v1, v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: v_and_b32_e32 v0, s7, v3 -; GFX8-NEXT: v_and_b32_e32 v6, s6, v2 -; GFX8-NEXT: v_mov_b32_e32 v5, s5 -; GFX8-NEXT: v_not_b32_e32 v1, v0 -; GFX8-NEXT: v_not_b32_e32 v0, v6 -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: v_mov_b32_e32 v7, v1 +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_and_b32_e32 v0, s7, v7 +; GFX8-NEXT: v_and_b32_e32 v1, s6, v6 +; GFX8-NEXT: v_not_b32_e32 v5, v0 +; GFX8-NEXT: v_not_b32_e32 v4, v1 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_cbranch_execnz .LBB56_1 @@ -2457,22 +2453,22 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NEXT: v_and_b32_e32 v0, s7, v3 -; GFX9-NEXT: v_and_b32_e32 v6, s6, v2 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-NEXT: v_not_b32_e32 v1, v0 -; GFX9-NEXT: v_not_b32_e32 v0, v6 -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX9-NEXT: v_mov_b32_e32 v7, v1 +; GFX9-NEXT: v_mov_b32_e32 v6, v0 +; GFX9-NEXT: v_and_b32_e32 v0, s7, v7 +; GFX9-NEXT: v_and_b32_e32 v1, s6, v6 +; GFX9-NEXT: v_not_b32_e32 v5, v0 +; GFX9-NEXT: v_not_b32_e32 v4, v1 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB56_1 @@ -2497,27 +2493,25 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6 ; GFX7-NEXT: v_mov_b32_e32 v3, s35 ; GFX7-NEXT: flat_load_dword v1, v[0:1] ; GFX7-NEXT: flat_load_dword v0, v[2:3] -; GFX7-NEXT: s_mov_b64 s[36:37], 0 +; GFX7-NEXT: s_mov_b64 s[34:35], 0 ; GFX7-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: v_mov_b32_e32 v4, s34 -; GFX7-NEXT: v_and_b32_e32 v0, s7, v3 -; GFX7-NEXT: v_and_b32_e32 v6, s6, v2 -; GFX7-NEXT: v_mov_b32_e32 v5, s35 -; GFX7-NEXT: v_not_b32_e32 v1, v0 -; GFX7-NEXT: v_not_b32_e32 v0, v6 -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: v_and_b32_e32 v0, s7, v7 +; GFX7-NEXT: v_and_b32_e32 v1, s6, v6 +; GFX7-NEXT: v_not_b32_e32 v5, v0 +; GFX7-NEXT: v_not_b32_e32 v4, v1 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX7-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_cbranch_execnz .LBB57_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[36:37] +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_nand_i64_ret_offset_scalar: @@ -2533,27 +2527,25 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6 ; GFX8-NEXT: v_mov_b32_e32 v3, s35 ; GFX8-NEXT: flat_load_dword v1, v[0:1] ; GFX8-NEXT: flat_load_dword v0, v[2:3] -; GFX8-NEXT: s_mov_b64 s[36:37], 0 +; GFX8-NEXT: s_mov_b64 s[34:35], 0 ; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: v_mov_b32_e32 v4, s34 -; GFX8-NEXT: v_and_b32_e32 v0, s7, v3 -; GFX8-NEXT: v_and_b32_e32 v6, s6, v2 -; GFX8-NEXT: v_mov_b32_e32 v5, s35 -; GFX8-NEXT: v_not_b32_e32 v1, v0 -; GFX8-NEXT: v_not_b32_e32 v0, v6 -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: v_mov_b32_e32 v7, v1 +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_and_b32_e32 v0, s7, v7 +; GFX8-NEXT: v_and_b32_e32 v1, s6, v6 +; GFX8-NEXT: v_not_b32_e32 v5, v0 +; GFX8-NEXT: v_not_b32_e32 v4, v1 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX8-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_cbranch_execnz .LBB57_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[36:37] +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_nand_i64_ret_offset_scalar: @@ -2562,22 +2554,22 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NEXT: v_and_b32_e32 v0, s7, v3 -; GFX9-NEXT: v_and_b32_e32 v6, s6, v2 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-NEXT: v_not_b32_e32 v1, v0 -; GFX9-NEXT: v_not_b32_e32 v0, v6 -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v7, v1 +; GFX9-NEXT: v_mov_b32_e32 v6, v0 +; GFX9-NEXT: v_and_b32_e32 v0, s7, v7 +; GFX9-NEXT: v_and_b32_e32 v1, s6, v6 +; GFX9-NEXT: v_not_b32_e32 v5, v0 +; GFX9-NEXT: v_not_b32_e32 v4, v1 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB57_1 @@ -3857,17 +3849,17 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GFX7-NEXT: v_mov_b32_e32 v4, s35 ; GFX7-NEXT: flat_load_dword v2, v[0:1] ; GFX7-NEXT: flat_load_dword v3, v[3:4] +; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s7 +; GFX7-NEXT: v_mov_b32_e32 v7, s6 +; GFX7-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-NEXT: .LBB84_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v0, s7 -; GFX7-NEXT: v_mov_b32_e32 v6, s6 -; GFX7-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-NEXT: v_mov_b32_e32 v5, s5 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol @@ -3892,17 +3884,17 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GFX8-NEXT: v_mov_b32_e32 v4, s35 ; GFX8-NEXT: flat_load_dword v2, v[0:1] ; GFX8-NEXT: flat_load_dword v3, v[3:4] +; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s7 +; GFX8-NEXT: v_mov_b32_e32 v7, s6 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: .LBB84_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v0, s7 -; GFX8-NEXT: v_mov_b32_e32 v6, s6 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: v_mov_b32_e32 v5, s5 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol @@ -3922,17 +3914,17 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-NEXT: v_mov_b32_e32 v7, s6 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: .LBB84_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_mov_b32_e32 v6, s6 -; GFX9-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -3963,28 +3955,26 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out, ; GFX7-NEXT: v_mov_b32_e32 v5, s35 ; GFX7-NEXT: flat_load_dword v3, v[0:1] ; GFX7-NEXT: flat_load_dword v2, v[4:5] -; GFX7-NEXT: s_mov_b64 s[36:37], 0 +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s7 +; GFX7-NEXT: v_mov_b32_e32 v7, s6 ; GFX7-NEXT: .LBB85_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v0, s7 -; GFX7-NEXT: v_mov_b32_e32 v6, s6 -; GFX7-NEXT: v_mov_b32_e32 v4, s34 -; GFX7-NEXT: v_mov_b32_e32 v5, s35 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_cbranch_execnz .LBB85_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[36:37] +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_max_i64_noret_offset_scalar: @@ -4000,28 +3990,26 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out, ; GFX8-NEXT: v_mov_b32_e32 v5, s35 ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: flat_load_dword v2, v[4:5] -; GFX8-NEXT: s_mov_b64 s[36:37], 0 +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s7 +; GFX8-NEXT: v_mov_b32_e32 v7, s6 ; GFX8-NEXT: .LBB85_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v0, s7 -; GFX8-NEXT: v_mov_b32_e32 v6, s6 -; GFX8-NEXT: v_mov_b32_e32 v4, s34 -; GFX8-NEXT: v_mov_b32_e32 v5, s35 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_cbranch_execnz .LBB85_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[36:37] +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_max_i64_noret_offset_scalar: @@ -4030,17 +4018,17 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out, ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-NEXT: v_mov_b32_e32 v7, s6 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: .LBB85_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_mov_b32_e32 v6, s6 -; GFX9-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -4070,23 +4058,23 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX7-NEXT: v_mov_b32_e32 v3, s35 ; GFX7-NEXT: flat_load_dword v0, v[0:1] ; GFX7-NEXT: flat_load_dword v1, v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s7 +; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: .LBB86_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v0, s7 -; GFX7-NEXT: v_mov_b32_e32 v6, s6 -; GFX7-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-NEXT: v_mov_b32_e32 v5, s5 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_cbranch_execnz .LBB86_1 @@ -4105,23 +4093,23 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX8-NEXT: v_mov_b32_e32 v3, s35 ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: flat_load_dword v1, v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s7 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: .LBB86_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v0, s7 -; GFX8-NEXT: v_mov_b32_e32 v6, s6 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: v_mov_b32_e32 v5, s5 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_cbranch_execnz .LBB86_1 @@ -4135,23 +4123,23 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s7 +; GFX9-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB86_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_mov_b32_e32 v6, s6 -; GFX9-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX9-NEXT: v_mov_b32_e32 v9, v1 +; GFX9-NEXT: v_mov_b32_e32 v8, v0 +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9] +; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB86_1 @@ -4176,28 +4164,26 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX7-NEXT: v_mov_b32_e32 v3, s35 ; GFX7-NEXT: flat_load_dword v1, v[0:1] ; GFX7-NEXT: flat_load_dword v0, v[2:3] -; GFX7-NEXT: s_mov_b64 s[36:37], 0 +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s7 +; GFX7-NEXT: v_mov_b32_e32 v5, s6 ; GFX7-NEXT: .LBB87_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v0, s7 -; GFX7-NEXT: v_mov_b32_e32 v6, s6 -; GFX7-NEXT: v_mov_b32_e32 v4, s34 -; GFX7-NEXT: v_mov_b32_e32 v5, s35 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX7-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_cbranch_execnz .LBB87_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[36:37] +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_max_i64_ret_offset_scalar: @@ -4213,28 +4199,26 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX8-NEXT: v_mov_b32_e32 v3, s35 ; GFX8-NEXT: flat_load_dword v1, v[0:1] ; GFX8-NEXT: flat_load_dword v0, v[2:3] -; GFX8-NEXT: s_mov_b64 s[36:37], 0 +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s7 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 ; GFX8-NEXT: .LBB87_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v0, s7 -; GFX8-NEXT: v_mov_b32_e32 v6, s6 -; GFX8-NEXT: v_mov_b32_e32 v4, s34 -; GFX8-NEXT: v_mov_b32_e32 v5, s35 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX8-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_cbranch_execnz .LBB87_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[36:37] +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_max_i64_ret_offset_scalar: @@ -4243,23 +4227,23 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s7 +; GFX9-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB87_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_mov_b32_e32 v6, s6 -; GFX9-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v9, v1 +; GFX9-NEXT: v_mov_b32_e32 v8, v0 +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9] +; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB87_1 @@ -4282,28 +4266,26 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX7-NEXT: s_addc_u32 s1, s1, s5 ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s3 +; GFX7-NEXT: v_mov_b32_e32 v7, s2 ; GFX7-NEXT: .LBB88_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v0, s3 -; GFX7-NEXT: v_mov_b32_e32 v6, s2 -; GFX7-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-NEXT: v_mov_b32_e32 v4, s0 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX7-NEXT: s_cbranch_execnz .LBB88_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm @@ -4318,28 +4300,26 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX8-NEXT: s_addc_u32 s1, s1, s5 ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s3 +; GFX8-NEXT: v_mov_b32_e32 v7, s2 ; GFX8-NEXT: .LBB88_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v0, s3 -; GFX8-NEXT: v_mov_b32_e32 v6, s2 -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX8-NEXT: s_cbranch_execnz .LBB88_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm @@ -4352,28 +4332,26 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX9-NEXT: s_add_u32 s0, s0, s4 ; GFX9-NEXT: s_addc_u32 s1, s1, s5 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 -; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_mov_b32_e32 v5, s1 +; GFX9-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[4:5] offset:32 +; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: v_mov_b32_e32 v6, s3 +; GFX9-NEXT: v_mov_b32_e32 v7, s2 ; GFX9-NEXT: .LBB88_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: v_mov_b32_e32 v6, s2 -; GFX9-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NEXT: v_mov_b32_e32 v4, s0 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX9-NEXT: s_cbranch_execnz .LBB88_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm @@ -4396,32 +4374,30 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX7-NEXT: s_mov_b64 s[6:7], 0 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s5 +; GFX7-NEXT: v_mov_b32_e32 v5, s4 ; GFX7-NEXT: .LBB89_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v0, s5 -; GFX7-NEXT: v_mov_b32_e32 v6, s4 -; GFX7-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-NEXT: v_mov_b32_e32 v4, s0 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: v_mov_b32_e32 v9, v3 +; GFX7-NEXT: v_mov_b32_e32 v8, v2 +; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX7-NEXT: s_cbranch_execnz .LBB89_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_max_i64_ret_addr64_offset: @@ -4435,69 +4411,65 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX8-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: v_mov_b32_e32 v5, s4 ; GFX8-NEXT: .LBB89_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v0, s5 -; GFX8-NEXT: v_mov_b32_e32 v6, s4 -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: v_mov_b32_e32 v9, v3 +; GFX8-NEXT: v_mov_b32_e32 v8, v2 +; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX8-NEXT: s_cbranch_execnz .LBB89_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_max_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 ; GFX9-NEXT: s_add_u32 s0, s8, s0 ; GFX9-NEXT: s_addc_u32 s1, s9, s1 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s13 +; GFX9-NEXT: v_mov_b32_e32 v5, s12 ; GFX9-NEXT: .LBB89_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v0, s13 -; GFX9-NEXT: v_mov_b32_e32 v6, s12 -; GFX9-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NEXT: v_mov_b32_e32 v4, s0 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v9, v3 +; GFX9-NEXT: v_mov_b32_e32 v8, v2 +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[8:9] +; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX9-NEXT: s_cbranch_execnz .LBB89_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v2, s10 -; GFX9-NEXT: v_mov_b32_e32 v3, s11 -; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s10 +; GFX9-NEXT: v_mov_b32_e32 v1, s11 +; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX9-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -4516,28 +4488,26 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX7-NEXT: s_add_u32 s0, s0, s4 ; GFX7-NEXT: s_addc_u32 s1, s1, s5 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s3 +; GFX7-NEXT: v_mov_b32_e32 v7, s2 ; GFX7-NEXT: .LBB90_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v0, s3 -; GFX7-NEXT: v_mov_b32_e32 v6, s2 -; GFX7-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-NEXT: v_mov_b32_e32 v4, s0 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX7-NEXT: s_cbranch_execnz .LBB90_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm @@ -4550,28 +4520,26 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX8-NEXT: s_add_u32 s0, s0, s4 ; GFX8-NEXT: s_addc_u32 s1, s1, s5 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s3 +; GFX8-NEXT: v_mov_b32_e32 v7, s2 ; GFX8-NEXT: .LBB90_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v0, s3 -; GFX8-NEXT: v_mov_b32_e32 v6, s2 -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX8-NEXT: s_cbranch_execnz .LBB90_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm @@ -4584,28 +4552,26 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX9-NEXT: s_add_u32 s0, s0, s4 ; GFX9-NEXT: s_addc_u32 s1, s1, s5 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_mov_b32_e32 v5, s1 +; GFX9-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: v_mov_b32_e32 v6, s3 +; GFX9-NEXT: v_mov_b32_e32 v7, s2 ; GFX9-NEXT: .LBB90_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: v_mov_b32_e32 v6, s2 -; GFX9-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NEXT: v_mov_b32_e32 v4, s0 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX9-NEXT: s_cbranch_execnz .LBB90_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm @@ -4625,32 +4591,30 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX7-NEXT: s_addc_u32 s1, s1, s7 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX7-NEXT: s_mov_b64 s[6:7], 0 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s5 +; GFX7-NEXT: v_mov_b32_e32 v5, s4 ; GFX7-NEXT: .LBB91_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v0, s5 -; GFX7-NEXT: v_mov_b32_e32 v6, s4 -; GFX7-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-NEXT: v_mov_b32_e32 v4, s0 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: v_mov_b32_e32 v9, v3 +; GFX7-NEXT: v_mov_b32_e32 v8, v2 +; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX7-NEXT: s_cbranch_execnz .LBB91_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_max_i64_ret_addr64: @@ -4662,69 +4626,65 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX8-NEXT: s_addc_u32 s1, s1, s7 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: v_mov_b32_e32 v5, s4 ; GFX8-NEXT: .LBB91_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v0, s5 -; GFX8-NEXT: v_mov_b32_e32 v6, s4 -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: v_mov_b32_e32 v9, v3 +; GFX8-NEXT: v_mov_b32_e32 v8, v2 +; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX8-NEXT: s_cbranch_execnz .LBB91_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_max_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 ; GFX9-NEXT: s_add_u32 s0, s8, s0 ; GFX9-NEXT: s_addc_u32 s1, s9, s1 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s13 +; GFX9-NEXT: v_mov_b32_e32 v5, s12 ; GFX9-NEXT: .LBB91_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v0, s13 -; GFX9-NEXT: v_mov_b32_e32 v6, s12 -; GFX9-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NEXT: v_mov_b32_e32 v4, s0 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX9-NEXT: v_mov_b32_e32 v9, v3 +; GFX9-NEXT: v_mov_b32_e32 v8, v2 +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[8:9] +; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX9-NEXT: s_cbranch_execnz .LBB91_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v2, s10 -; GFX9-NEXT: v_mov_b32_e32 v3, s11 -; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s10 +; GFX9-NEXT: v_mov_b32_e32 v1, s11 +; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX9-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -5271,17 +5231,17 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg %ptr, i64 in ; GFX7-NEXT: v_mov_b32_e32 v4, s35 ; GFX7-NEXT: flat_load_dword v2, v[0:1] ; GFX7-NEXT: flat_load_dword v3, v[3:4] +; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s7 +; GFX7-NEXT: v_mov_b32_e32 v7, s6 +; GFX7-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-NEXT: .LBB98_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v0, s7 -; GFX7-NEXT: v_mov_b32_e32 v6, s6 -; GFX7-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-NEXT: v_mov_b32_e32 v5, s5 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol @@ -5306,17 +5266,17 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg %ptr, i64 in ; GFX8-NEXT: v_mov_b32_e32 v4, s35 ; GFX8-NEXT: flat_load_dword v2, v[0:1] ; GFX8-NEXT: flat_load_dword v3, v[3:4] +; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s7 +; GFX8-NEXT: v_mov_b32_e32 v7, s6 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: .LBB98_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v0, s7 -; GFX8-NEXT: v_mov_b32_e32 v6, s6 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: v_mov_b32_e32 v5, s5 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol @@ -5336,17 +5296,17 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg %ptr, i64 in ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-NEXT: v_mov_b32_e32 v7, s6 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: .LBB98_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_mov_b32_e32 v6, s6 -; GFX9-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -5377,28 +5337,26 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out, ; GFX7-NEXT: v_mov_b32_e32 v5, s35 ; GFX7-NEXT: flat_load_dword v3, v[0:1] ; GFX7-NEXT: flat_load_dword v2, v[4:5] -; GFX7-NEXT: s_mov_b64 s[36:37], 0 +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s7 +; GFX7-NEXT: v_mov_b32_e32 v7, s6 ; GFX7-NEXT: .LBB99_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v0, s7 -; GFX7-NEXT: v_mov_b32_e32 v6, s6 -; GFX7-NEXT: v_mov_b32_e32 v4, s34 -; GFX7-NEXT: v_mov_b32_e32 v5, s35 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_cbranch_execnz .LBB99_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[36:37] +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_umax_i64_noret_offset_scalar: @@ -5414,28 +5372,26 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out, ; GFX8-NEXT: v_mov_b32_e32 v5, s35 ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: flat_load_dword v2, v[4:5] -; GFX8-NEXT: s_mov_b64 s[36:37], 0 +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s7 +; GFX8-NEXT: v_mov_b32_e32 v7, s6 ; GFX8-NEXT: .LBB99_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v0, s7 -; GFX8-NEXT: v_mov_b32_e32 v6, s6 -; GFX8-NEXT: v_mov_b32_e32 v4, s34 -; GFX8-NEXT: v_mov_b32_e32 v5, s35 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_cbranch_execnz .LBB99_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[36:37] +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_umax_i64_noret_offset_scalar: @@ -5444,17 +5400,17 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out, ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-NEXT: v_mov_b32_e32 v7, s6 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: .LBB99_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_mov_b32_e32 v6, s6 -; GFX9-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -5484,23 +5440,23 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX7-NEXT: v_mov_b32_e32 v3, s35 ; GFX7-NEXT: flat_load_dword v0, v[0:1] ; GFX7-NEXT: flat_load_dword v1, v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s7 +; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: .LBB100_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v0, s7 -; GFX7-NEXT: v_mov_b32_e32 v6, s6 -; GFX7-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-NEXT: v_mov_b32_e32 v5, s5 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_cbranch_execnz .LBB100_1 @@ -5519,23 +5475,23 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX8-NEXT: v_mov_b32_e32 v3, s35 ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: flat_load_dword v1, v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s7 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: .LBB100_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v0, s7 -; GFX8-NEXT: v_mov_b32_e32 v6, s6 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: v_mov_b32_e32 v5, s5 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_cbranch_execnz .LBB100_1 @@ -5549,23 +5505,23 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s7 +; GFX9-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB100_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_mov_b32_e32 v6, s6 -; GFX9-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX9-NEXT: v_mov_b32_e32 v9, v1 +; GFX9-NEXT: v_mov_b32_e32 v8, v0 +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9] +; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB100_1 @@ -5590,28 +5546,26 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i6 ; GFX7-NEXT: v_mov_b32_e32 v3, s35 ; GFX7-NEXT: flat_load_dword v1, v[0:1] ; GFX7-NEXT: flat_load_dword v0, v[2:3] -; GFX7-NEXT: s_mov_b64 s[36:37], 0 +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s7 +; GFX7-NEXT: v_mov_b32_e32 v5, s6 ; GFX7-NEXT: .LBB101_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v0, s7 -; GFX7-NEXT: v_mov_b32_e32 v6, s6 -; GFX7-NEXT: v_mov_b32_e32 v4, s34 -; GFX7-NEXT: v_mov_b32_e32 v5, s35 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX7-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_cbranch_execnz .LBB101_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[36:37] +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_umax_i64_ret_offset_scalar: @@ -5627,28 +5581,26 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i6 ; GFX8-NEXT: v_mov_b32_e32 v3, s35 ; GFX8-NEXT: flat_load_dword v1, v[0:1] ; GFX8-NEXT: flat_load_dword v0, v[2:3] -; GFX8-NEXT: s_mov_b64 s[36:37], 0 +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s7 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 ; GFX8-NEXT: .LBB101_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v0, s7 -; GFX8-NEXT: v_mov_b32_e32 v6, s6 -; GFX8-NEXT: v_mov_b32_e32 v4, s34 -; GFX8-NEXT: v_mov_b32_e32 v5, s35 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX8-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_cbranch_execnz .LBB101_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[36:37] +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_umax_i64_ret_offset_scalar: @@ -5657,23 +5609,23 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i6 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s7 +; GFX9-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB101_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_mov_b32_e32 v6, s6 -; GFX9-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v9, v1 +; GFX9-NEXT: v_mov_b32_e32 v8, v0 +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9] +; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB101_1 @@ -5696,28 +5648,26 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; GFX7-NEXT: s_addc_u32 s1, s1, s5 ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s3 +; GFX7-NEXT: v_mov_b32_e32 v7, s2 ; GFX7-NEXT: .LBB102_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v0, s3 -; GFX7-NEXT: v_mov_b32_e32 v6, s2 -; GFX7-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-NEXT: v_mov_b32_e32 v4, s0 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX7-NEXT: s_cbranch_execnz .LBB102_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm @@ -5732,28 +5682,26 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; GFX8-NEXT: s_addc_u32 s1, s1, s5 ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s3 +; GFX8-NEXT: v_mov_b32_e32 v7, s2 ; GFX8-NEXT: .LBB102_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v0, s3 -; GFX8-NEXT: v_mov_b32_e32 v6, s2 -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX8-NEXT: s_cbranch_execnz .LBB102_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm @@ -5766,28 +5714,26 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX9-NEXT: s_add_u32 s0, s0, s4 ; GFX9-NEXT: s_addc_u32 s1, s1, s5 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 -; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_mov_b32_e32 v5, s1 +; GFX9-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[4:5] offset:32 +; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: v_mov_b32_e32 v6, s3 +; GFX9-NEXT: v_mov_b32_e32 v7, s2 ; GFX9-NEXT: .LBB102_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: v_mov_b32_e32 v6, s2 -; GFX9-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NEXT: v_mov_b32_e32 v4, s0 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX9-NEXT: s_cbranch_execnz .LBB102_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm @@ -5810,32 +5756,30 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX7-NEXT: s_mov_b64 s[6:7], 0 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s5 +; GFX7-NEXT: v_mov_b32_e32 v5, s4 ; GFX7-NEXT: .LBB103_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v0, s5 -; GFX7-NEXT: v_mov_b32_e32 v6, s4 -; GFX7-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-NEXT: v_mov_b32_e32 v4, s0 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: v_mov_b32_e32 v9, v3 +; GFX7-NEXT: v_mov_b32_e32 v8, v2 +; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX7-NEXT: s_cbranch_execnz .LBB103_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_umax_i64_ret_addr64_offset: @@ -5849,69 +5793,65 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: v_mov_b32_e32 v5, s4 ; GFX8-NEXT: .LBB103_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v0, s5 -; GFX8-NEXT: v_mov_b32_e32 v6, s4 -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: v_mov_b32_e32 v9, v3 +; GFX8-NEXT: v_mov_b32_e32 v8, v2 +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX8-NEXT: s_cbranch_execnz .LBB103_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_umax_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 ; GFX9-NEXT: s_add_u32 s0, s8, s0 ; GFX9-NEXT: s_addc_u32 s1, s9, s1 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s13 +; GFX9-NEXT: v_mov_b32_e32 v5, s12 ; GFX9-NEXT: .LBB103_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v0, s13 -; GFX9-NEXT: v_mov_b32_e32 v6, s12 -; GFX9-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NEXT: v_mov_b32_e32 v4, s0 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v9, v3 +; GFX9-NEXT: v_mov_b32_e32 v8, v2 +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[8:9] +; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX9-NEXT: s_cbranch_execnz .LBB103_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v2, s10 -; GFX9-NEXT: v_mov_b32_e32 v3, s11 -; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s10 +; GFX9-NEXT: v_mov_b32_e32 v1, s11 +; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX9-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -5931,32 +5871,30 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GFX7-NEXT: s_addc_u32 s1, s1, s7 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX7-NEXT: s_mov_b64 s[6:7], 0 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s5 +; GFX7-NEXT: v_mov_b32_e32 v5, s4 ; GFX7-NEXT: .LBB104_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v0, s5 -; GFX7-NEXT: v_mov_b32_e32 v6, s4 -; GFX7-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-NEXT: v_mov_b32_e32 v4, s0 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: v_mov_b32_e32 v9, v3 +; GFX7-NEXT: v_mov_b32_e32 v8, v2 +; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX7-NEXT: s_cbranch_execnz .LBB104_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_umax_i64_ret_addr64: @@ -5968,69 +5906,65 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GFX8-NEXT: s_addc_u32 s1, s1, s7 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: v_mov_b32_e32 v5, s4 ; GFX8-NEXT: .LBB104_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v0, s5 -; GFX8-NEXT: v_mov_b32_e32 v6, s4 -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: v_mov_b32_e32 v9, v3 +; GFX8-NEXT: v_mov_b32_e32 v8, v2 +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX8-NEXT: s_cbranch_execnz .LBB104_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_umax_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 ; GFX9-NEXT: s_add_u32 s0, s8, s0 ; GFX9-NEXT: s_addc_u32 s1, s9, s1 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s13 +; GFX9-NEXT: v_mov_b32_e32 v5, s12 ; GFX9-NEXT: .LBB104_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v0, s13 -; GFX9-NEXT: v_mov_b32_e32 v6, s12 -; GFX9-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NEXT: v_mov_b32_e32 v4, s0 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX9-NEXT: v_mov_b32_e32 v9, v3 +; GFX9-NEXT: v_mov_b32_e32 v8, v2 +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[8:9] +; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX9-NEXT: s_cbranch_execnz .LBB104_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v2, s10 -; GFX9-NEXT: v_mov_b32_e32 v3, s11 -; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s10 +; GFX9-NEXT: v_mov_b32_e32 v1, s11 +; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX9-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -6577,17 +6511,17 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 in ; GFX7-NEXT: v_mov_b32_e32 v4, s35 ; GFX7-NEXT: flat_load_dword v2, v[0:1] ; GFX7-NEXT: flat_load_dword v3, v[3:4] +; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s7 +; GFX7-NEXT: v_mov_b32_e32 v7, s6 +; GFX7-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-NEXT: .LBB111_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v0, s7 -; GFX7-NEXT: v_mov_b32_e32 v6, s6 -; GFX7-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-NEXT: v_mov_b32_e32 v5, s5 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol @@ -6612,17 +6546,17 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 in ; GFX8-NEXT: v_mov_b32_e32 v4, s35 ; GFX8-NEXT: flat_load_dword v2, v[0:1] ; GFX8-NEXT: flat_load_dword v3, v[3:4] +; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s7 +; GFX8-NEXT: v_mov_b32_e32 v7, s6 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: .LBB111_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v0, s7 -; GFX8-NEXT: v_mov_b32_e32 v6, s6 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: v_mov_b32_e32 v5, s5 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol @@ -6642,17 +6576,17 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 in ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-NEXT: v_mov_b32_e32 v7, s6 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: .LBB111_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_mov_b32_e32 v6, s6 -; GFX9-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -6683,28 +6617,26 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out, ; GFX7-NEXT: v_mov_b32_e32 v5, s35 ; GFX7-NEXT: flat_load_dword v3, v[0:1] ; GFX7-NEXT: flat_load_dword v2, v[4:5] -; GFX7-NEXT: s_mov_b64 s[36:37], 0 +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s7 +; GFX7-NEXT: v_mov_b32_e32 v7, s6 ; GFX7-NEXT: .LBB112_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v0, s7 -; GFX7-NEXT: v_mov_b32_e32 v6, s6 -; GFX7-NEXT: v_mov_b32_e32 v4, s34 -; GFX7-NEXT: v_mov_b32_e32 v5, s35 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_cbranch_execnz .LBB112_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[36:37] +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_umin_i64_noret_offset_scalar: @@ -6720,28 +6652,26 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out, ; GFX8-NEXT: v_mov_b32_e32 v5, s35 ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: flat_load_dword v2, v[4:5] -; GFX8-NEXT: s_mov_b64 s[36:37], 0 +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s7 +; GFX8-NEXT: v_mov_b32_e32 v7, s6 ; GFX8-NEXT: .LBB112_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v0, s7 -; GFX8-NEXT: v_mov_b32_e32 v6, s6 -; GFX8-NEXT: v_mov_b32_e32 v4, s34 -; GFX8-NEXT: v_mov_b32_e32 v5, s35 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_cbranch_execnz .LBB112_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[36:37] +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_umin_i64_noret_offset_scalar: @@ -6750,17 +6680,17 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out, ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-NEXT: v_mov_b32_e32 v7, s6 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: .LBB112_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_mov_b32_e32 v6, s6 -; GFX9-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -6790,23 +6720,23 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX7-NEXT: v_mov_b32_e32 v3, s35 ; GFX7-NEXT: flat_load_dword v0, v[0:1] ; GFX7-NEXT: flat_load_dword v1, v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s7 +; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: .LBB113_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v0, s7 -; GFX7-NEXT: v_mov_b32_e32 v6, s6 -; GFX7-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-NEXT: v_mov_b32_e32 v5, s5 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_cbranch_execnz .LBB113_1 @@ -6825,23 +6755,23 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX8-NEXT: v_mov_b32_e32 v3, s35 ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: flat_load_dword v1, v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s7 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: .LBB113_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v0, s7 -; GFX8-NEXT: v_mov_b32_e32 v6, s6 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: v_mov_b32_e32 v5, s5 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_cbranch_execnz .LBB113_1 @@ -6855,23 +6785,23 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s7 +; GFX9-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB113_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_mov_b32_e32 v6, s6 -; GFX9-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX9-NEXT: v_mov_b32_e32 v9, v1 +; GFX9-NEXT: v_mov_b32_e32 v8, v0 +; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9] +; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB113_1 @@ -6896,28 +6826,26 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i6 ; GFX7-NEXT: v_mov_b32_e32 v3, s35 ; GFX7-NEXT: flat_load_dword v1, v[0:1] ; GFX7-NEXT: flat_load_dword v0, v[2:3] -; GFX7-NEXT: s_mov_b64 s[36:37], 0 +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s7 +; GFX7-NEXT: v_mov_b32_e32 v5, s6 ; GFX7-NEXT: .LBB114_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v0, s7 -; GFX7-NEXT: v_mov_b32_e32 v6, s6 -; GFX7-NEXT: v_mov_b32_e32 v4, s34 -; GFX7-NEXT: v_mov_b32_e32 v5, s35 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX7-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_cbranch_execnz .LBB114_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[36:37] +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_umin_i64_ret_offset_scalar: @@ -6933,28 +6861,26 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i6 ; GFX8-NEXT: v_mov_b32_e32 v3, s35 ; GFX8-NEXT: flat_load_dword v1, v[0:1] ; GFX8-NEXT: flat_load_dword v0, v[2:3] -; GFX8-NEXT: s_mov_b64 s[36:37], 0 +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s7 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 ; GFX8-NEXT: .LBB114_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v0, s7 -; GFX8-NEXT: v_mov_b32_e32 v6, s6 -; GFX8-NEXT: v_mov_b32_e32 v4, s34 -; GFX8-NEXT: v_mov_b32_e32 v5, s35 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX8-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_cbranch_execnz .LBB114_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[36:37] +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_umin_i64_ret_offset_scalar: @@ -6963,23 +6889,23 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i6 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s7 +; GFX9-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB114_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_mov_b32_e32 v6, s6 -; GFX9-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v9, v1 +; GFX9-NEXT: v_mov_b32_e32 v8, v0 +; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9] +; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB114_1 @@ -7529,17 +7455,17 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GFX7-NEXT: v_mov_b32_e32 v4, s35 ; GFX7-NEXT: flat_load_dword v2, v[0:1] ; GFX7-NEXT: flat_load_dword v3, v[3:4] +; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s7 +; GFX7-NEXT: v_mov_b32_e32 v7, s6 +; GFX7-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-NEXT: .LBB121_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v0, s7 -; GFX7-NEXT: v_mov_b32_e32 v6, s6 -; GFX7-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-NEXT: v_mov_b32_e32 v5, s5 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol @@ -7564,17 +7490,17 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GFX8-NEXT: v_mov_b32_e32 v4, s35 ; GFX8-NEXT: flat_load_dword v2, v[0:1] ; GFX8-NEXT: flat_load_dword v3, v[3:4] +; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s7 +; GFX8-NEXT: v_mov_b32_e32 v7, s6 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: .LBB121_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v0, s7 -; GFX8-NEXT: v_mov_b32_e32 v6, s6 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: v_mov_b32_e32 v5, s5 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol @@ -7594,17 +7520,17 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-NEXT: v_mov_b32_e32 v7, s6 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: .LBB121_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_mov_b32_e32 v6, s6 -; GFX9-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -7635,28 +7561,26 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out, ; GFX7-NEXT: v_mov_b32_e32 v5, s35 ; GFX7-NEXT: flat_load_dword v3, v[0:1] ; GFX7-NEXT: flat_load_dword v2, v[4:5] -; GFX7-NEXT: s_mov_b64 s[36:37], 0 +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s7 +; GFX7-NEXT: v_mov_b32_e32 v7, s6 ; GFX7-NEXT: .LBB122_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v0, s7 -; GFX7-NEXT: v_mov_b32_e32 v6, s6 -; GFX7-NEXT: v_mov_b32_e32 v4, s34 -; GFX7-NEXT: v_mov_b32_e32 v5, s35 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_cbranch_execnz .LBB122_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[36:37] +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_min_i64_noret_offset_scalar: @@ -7672,28 +7596,26 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out, ; GFX8-NEXT: v_mov_b32_e32 v5, s35 ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: flat_load_dword v2, v[4:5] -; GFX8-NEXT: s_mov_b64 s[36:37], 0 +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s7 +; GFX8-NEXT: v_mov_b32_e32 v7, s6 ; GFX8-NEXT: .LBB122_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v0, s7 -; GFX8-NEXT: v_mov_b32_e32 v6, s6 -; GFX8-NEXT: v_mov_b32_e32 v4, s34 -; GFX8-NEXT: v_mov_b32_e32 v5, s35 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_cbranch_execnz .LBB122_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[36:37] +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_min_i64_noret_offset_scalar: @@ -7702,17 +7624,17 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out, ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-NEXT: v_mov_b32_e32 v7, s6 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: .LBB122_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_mov_b32_e32 v6, s6 -; GFX9-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -7742,23 +7664,23 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX7-NEXT: v_mov_b32_e32 v3, s35 ; GFX7-NEXT: flat_load_dword v0, v[0:1] ; GFX7-NEXT: flat_load_dword v1, v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s7 +; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: .LBB123_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v0, s7 -; GFX7-NEXT: v_mov_b32_e32 v6, s6 -; GFX7-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-NEXT: v_mov_b32_e32 v5, s5 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_cbranch_execnz .LBB123_1 @@ -7777,23 +7699,23 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX8-NEXT: v_mov_b32_e32 v3, s35 ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: flat_load_dword v1, v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s7 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: .LBB123_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v0, s7 -; GFX8-NEXT: v_mov_b32_e32 v6, s6 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: v_mov_b32_e32 v5, s5 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_cbranch_execnz .LBB123_1 @@ -7807,23 +7729,23 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s7 +; GFX9-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB123_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_mov_b32_e32 v6, s6 -; GFX9-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX9-NEXT: v_mov_b32_e32 v9, v1 +; GFX9-NEXT: v_mov_b32_e32 v8, v0 +; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9] +; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB123_1 @@ -7848,28 +7770,26 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX7-NEXT: v_mov_b32_e32 v3, s35 ; GFX7-NEXT: flat_load_dword v1, v[0:1] ; GFX7-NEXT: flat_load_dword v0, v[2:3] -; GFX7-NEXT: s_mov_b64 s[36:37], 0 +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s7 +; GFX7-NEXT: v_mov_b32_e32 v5, s6 ; GFX7-NEXT: .LBB124_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v0, s7 -; GFX7-NEXT: v_mov_b32_e32 v6, s6 -; GFX7-NEXT: v_mov_b32_e32 v4, s34 -; GFX7-NEXT: v_mov_b32_e32 v5, s35 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX7-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_cbranch_execnz .LBB124_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[36:37] +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_min_i64_ret_offset_scalar: @@ -7885,28 +7805,26 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX8-NEXT: v_mov_b32_e32 v3, s35 ; GFX8-NEXT: flat_load_dword v1, v[0:1] ; GFX8-NEXT: flat_load_dword v0, v[2:3] -; GFX8-NEXT: s_mov_b64 s[36:37], 0 +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s7 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 ; GFX8-NEXT: .LBB124_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v0, s7 -; GFX8-NEXT: v_mov_b32_e32 v6, s6 -; GFX8-NEXT: v_mov_b32_e32 v4, s34 -; GFX8-NEXT: v_mov_b32_e32 v5, s35 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX8-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_cbranch_execnz .LBB124_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[36:37] +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_min_i64_ret_offset_scalar: @@ -7915,23 +7833,23 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s7 +; GFX9-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB124_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_mov_b32_e32 v6, s6 -; GFX9-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v9, v1 +; GFX9-NEXT: v_mov_b32_e32 v8, v0 +; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9] +; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB124_1 @@ -7954,28 +7872,26 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX7-NEXT: s_addc_u32 s1, s1, s5 ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s3 +; GFX7-NEXT: v_mov_b32_e32 v7, s2 ; GFX7-NEXT: .LBB125_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v0, s3 -; GFX7-NEXT: v_mov_b32_e32 v6, s2 -; GFX7-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-NEXT: v_mov_b32_e32 v4, s0 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX7-NEXT: s_cbranch_execnz .LBB125_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm @@ -7990,28 +7906,26 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX8-NEXT: s_addc_u32 s1, s1, s5 ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s3 +; GFX8-NEXT: v_mov_b32_e32 v7, s2 ; GFX8-NEXT: .LBB125_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v0, s3 -; GFX8-NEXT: v_mov_b32_e32 v6, s2 -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX8-NEXT: s_cbranch_execnz .LBB125_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm @@ -8024,28 +7938,26 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX9-NEXT: s_add_u32 s0, s0, s4 ; GFX9-NEXT: s_addc_u32 s1, s1, s5 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 -; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_mov_b32_e32 v5, s1 +; GFX9-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[4:5] offset:32 +; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: v_mov_b32_e32 v6, s3 +; GFX9-NEXT: v_mov_b32_e32 v7, s2 ; GFX9-NEXT: .LBB125_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: v_mov_b32_e32 v6, s2 -; GFX9-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NEXT: v_mov_b32_e32 v4, s0 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX9-NEXT: s_cbranch_execnz .LBB125_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm @@ -8068,32 +7980,30 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX7-NEXT: s_mov_b64 s[6:7], 0 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s5 +; GFX7-NEXT: v_mov_b32_e32 v5, s4 ; GFX7-NEXT: .LBB126_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v0, s5 -; GFX7-NEXT: v_mov_b32_e32 v6, s4 -; GFX7-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-NEXT: v_mov_b32_e32 v4, s0 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: v_mov_b32_e32 v9, v3 +; GFX7-NEXT: v_mov_b32_e32 v8, v2 +; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX7-NEXT: s_cbranch_execnz .LBB126_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_min_i64_ret_addr64_offset: @@ -8107,69 +8017,65 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX8-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: v_mov_b32_e32 v5, s4 ; GFX8-NEXT: .LBB126_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v0, s5 -; GFX8-NEXT: v_mov_b32_e32 v6, s4 -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: v_mov_b32_e32 v9, v3 +; GFX8-NEXT: v_mov_b32_e32 v8, v2 +; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX8-NEXT: s_cbranch_execnz .LBB126_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_min_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 ; GFX9-NEXT: s_add_u32 s0, s8, s0 ; GFX9-NEXT: s_addc_u32 s1, s9, s1 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s13 +; GFX9-NEXT: v_mov_b32_e32 v5, s12 ; GFX9-NEXT: .LBB126_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v0, s13 -; GFX9-NEXT: v_mov_b32_e32 v6, s12 -; GFX9-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NEXT: v_mov_b32_e32 v4, s0 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v9, v3 +; GFX9-NEXT: v_mov_b32_e32 v8, v2 +; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[8:9] +; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX9-NEXT: s_cbranch_execnz .LBB126_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v2, s10 -; GFX9-NEXT: v_mov_b32_e32 v3, s11 -; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s10 +; GFX9-NEXT: v_mov_b32_e32 v1, s11 +; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX9-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -8188,16 +8094,16 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v6, s3 +; GFX7-NEXT: v_mov_b32_e32 v7, s2 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-NEXT: .LBB127_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v0, s3 -; GFX7-NEXT: v_mov_b32_e32 v6, s2 -; GFX7-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-NEXT: v_mov_b32_e32 v4, s0 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol @@ -8218,16 +8124,16 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v6, s3 +; GFX8-NEXT: v_mov_b32_e32 v7, s2 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: .LBB127_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v0, s3 -; GFX8-NEXT: v_mov_b32_e32 v6, s2 -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol @@ -8248,16 +8154,16 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v5, s1 +; GFX9-NEXT: v_mov_b32_e32 v6, s3 +; GFX9-NEXT: v_mov_b32_e32 v7, s2 +; GFX9-NEXT: v_mov_b32_e32 v4, s0 ; GFX9-NEXT: .LBB127_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: v_mov_b32_e32 v6, s2 -; GFX9-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NEXT: v_mov_b32_e32 v4, s0 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -8284,32 +8190,30 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX7-NEXT: s_addc_u32 s1, s1, s7 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX7-NEXT: s_mov_b64 s[6:7], 0 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s5 +; GFX7-NEXT: v_mov_b32_e32 v5, s4 ; GFX7-NEXT: .LBB128_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v0, s5 -; GFX7-NEXT: v_mov_b32_e32 v6, s4 -; GFX7-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-NEXT: v_mov_b32_e32 v4, s0 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: v_mov_b32_e32 v9, v3 +; GFX7-NEXT: v_mov_b32_e32 v8, v2 +; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX7-NEXT: s_cbranch_execnz .LBB128_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_min_i64_ret_addr64: @@ -8321,69 +8225,65 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX8-NEXT: s_addc_u32 s1, s1, s7 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: v_mov_b32_e32 v5, s4 ; GFX8-NEXT: .LBB128_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v0, s5 -; GFX8-NEXT: v_mov_b32_e32 v6, s4 -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: v_mov_b32_e32 v9, v3 +; GFX8-NEXT: v_mov_b32_e32 v8, v2 +; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX8-NEXT: s_cbranch_execnz .LBB128_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_min_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 ; GFX9-NEXT: s_add_u32 s0, s8, s0 ; GFX9-NEXT: s_addc_u32 s1, s9, s1 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s13 +; GFX9-NEXT: v_mov_b32_e32 v5, s12 ; GFX9-NEXT: .LBB128_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v0, s13 -; GFX9-NEXT: v_mov_b32_e32 v6, s12 -; GFX9-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NEXT: v_mov_b32_e32 v4, s0 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX9-NEXT: v_mov_b32_e32 v9, v3 +; GFX9-NEXT: v_mov_b32_e32 v8, v2 +; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[8:9] +; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX9-NEXT: s_cbranch_execnz .LBB128_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v2, s10 -; GFX9-NEXT: v_mov_b32_e32 v3, s11 -; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s10 +; GFX9-NEXT: v_mov_b32_e32 v1, s11 +; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX9-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index diff --git a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll index 09230d21544ff..4aec2ffead437 100644 --- a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll @@ -1426,23 +1426,23 @@ main_body: define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) { ; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat_agent_safe: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX90A-NEXT: s_mov_b64 s[2:3], 0 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GFX90A-NEXT: s_mov_b64 s[0:1], 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX90A-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX90A-NEXT: s_cbranch_execnz .LBB50_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll index d0fc798e55b6e..ec4ea232e661c 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll @@ -18501,39 +18501,41 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB78_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: v_dual_add_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v3, v6, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -18548,34 +18550,34 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB78_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_add_f32_e32 v3, v6, v3 -; GFX10-NEXT: v_add_f32_e32 v5, v7, v5 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB78_1 @@ -18589,33 +18591,33 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB78_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_add_f32_e32 v4, v6, v4 -; GFX90A-NEXT: v_add_f32_e32 v3, v7, v3 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB78_1 @@ -18629,33 +18631,33 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB78_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_add_f32_e32 v5, v6, v5 -; GFX908-NEXT: v_add_f32_e32 v3, v7, v3 -; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: v_mov_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB78_1 @@ -18669,34 +18671,34 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB78_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_add_f32_e32 v5, v6, v5 -; GFX8-NEXT: v_add_f32_e32 v3, v7, v3 -; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v3 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB78_1 @@ -18827,39 +18829,41 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB79_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: v_dual_add_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v3, v6, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -18874,34 +18878,34 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB79_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_add_f32_e32 v3, v6, v3 -; GFX10-NEXT: v_add_f32_e32 v5, v7, v5 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB79_1 @@ -18915,33 +18919,33 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB79_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_add_f32_e32 v4, v6, v4 -; GFX90A-NEXT: v_add_f32_e32 v3, v7, v3 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB79_1 @@ -18955,33 +18959,33 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB79_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_add_f32_e32 v5, v6, v5 -; GFX908-NEXT: v_add_f32_e32 v3, v7, v3 -; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: v_mov_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB79_1 @@ -18997,34 +19001,34 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB79_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX8-NEXT: v_add_f32_e32 v5, v6, v5 -; GFX8-NEXT: v_add_f32_e32 v0, v7, v0 -; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB79_1 @@ -19155,39 +19159,41 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB80_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: v_dual_add_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v3, v6, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -19202,34 +19208,34 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB80_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_add_f32_e32 v3, v6, v3 -; GFX10-NEXT: v_add_f32_e32 v5, v7, v5 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:-2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB80_1 @@ -19243,33 +19249,33 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB80_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_add_f32_e32 v4, v6, v4 -; GFX90A-NEXT: v_add_f32_e32 v3, v7, v3 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB80_1 @@ -19283,33 +19289,33 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB80_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_add_f32_e32 v5, v6, v5 -; GFX908-NEXT: v_add_f32_e32 v3, v7, v3 -; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX908-NEXT: v_mov_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB80_1 @@ -19325,34 +19331,34 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB80_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX8-NEXT: v_add_f32_e32 v5, v6, v5 -; GFX8-NEXT: v_add_f32_e32 v0, v7, v0 -; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB80_1 @@ -19486,41 +19492,41 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX11-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB81_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX11-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -19533,35 +19539,35 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX10-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB81_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX10-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB81_1 @@ -19572,36 +19578,36 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off +; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB81_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_add_f32_e32 v3, v4, v3 -; GFX90A-NEXT: v_add_f32_e32 v4, v7, v6 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB81_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -19611,36 +19617,36 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX908-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off +; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB81_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX908-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX908-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX908-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB81_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -19650,37 +19656,37 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX8-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB81_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX8-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB81_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -19804,41 +19810,41 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX11-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044 +; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB82_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX11-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -19851,35 +19857,35 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX10-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB82_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX10-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB82_1 @@ -19890,36 +19896,36 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB82_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_add_f32_e32 v3, v4, v3 -; GFX90A-NEXT: v_add_f32_e32 v4, v7, v6 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB82_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -19929,36 +19935,36 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX908-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB82_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX908-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX908-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX908-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB82_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -19970,37 +19976,37 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB82_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX8-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB82_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -20125,41 +20131,41 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX11-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:-2048 +; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB83_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX11-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -20172,35 +20178,35 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX10-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:-2048 +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB83_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX10-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB83_1 @@ -20211,36 +20217,36 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:-2048 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB83_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_add_f32_e32 v3, v4, v3 -; GFX90A-NEXT: v_add_f32_e32 v4, v7, v6 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB83_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -20250,36 +20256,36 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX908-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:-2048 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB83_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX908-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX908-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX908-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB83_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -20291,37 +20297,37 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB83_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX8-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB83_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -20456,39 +20462,41 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB84_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: v_dual_add_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v3, v6, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -20503,34 +20511,34 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB84_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_add_f32_e32 v3, v6, v3 -; GFX10-NEXT: v_add_f32_e32 v5, v7, v5 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB84_1 @@ -20544,35 +20552,35 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB84_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_add_f32_e32 v4, v6, v4 -; GFX90A-NEXT: v_add_f32_e32 v3, v7, v3 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB84_1 @@ -20586,33 +20594,33 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB84_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_add_f32_e32 v5, v6, v5 -; GFX908-NEXT: v_add_f32_e32 v3, v7, v3 -; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: v_mov_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB84_1 @@ -20628,34 +20636,34 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB84_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX8-NEXT: v_add_f32_e32 v5, v6, v5 -; GFX8-NEXT: v_add_f32_e32 v0, v7, v0 -; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB84_1 @@ -20786,41 +20794,41 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX11-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044 +; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB85_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX11-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -20833,35 +20841,35 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX10-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB85_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX10-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB85_1 @@ -20872,38 +20880,38 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX90A-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB85_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_add_f32_e32 v3, v4, v3 -; GFX90A-NEXT: v_add_f32_e32 v4, v7, v6 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB85_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -20913,36 +20921,36 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX908-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB85_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX908-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX908-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX908-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB85_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -20954,37 +20962,37 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB85_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX8-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB85_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -21110,39 +21118,41 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memor ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB86_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: v_dual_add_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v3, v6, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -21157,34 +21167,34 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memor ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB86_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_add_f32_e32 v3, v6, v3 -; GFX10-NEXT: v_add_f32_e32 v5, v7, v5 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB86_1 @@ -21198,33 +21208,33 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memor ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB86_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_add_f32_e32 v4, v6, v4 -; GFX90A-NEXT: v_add_f32_e32 v3, v7, v3 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB86_1 @@ -21238,33 +21248,33 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memor ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB86_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_add_f32_e32 v5, v6, v5 -; GFX908-NEXT: v_add_f32_e32 v3, v7, v3 -; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: v_mov_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB86_1 @@ -21278,34 +21288,34 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memor ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB86_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_add_f32_e32 v5, v6, v5 -; GFX8-NEXT: v_add_f32_e32 v3, v7, v3 -; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v3 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB86_1 @@ -21435,41 +21445,41 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr ; GFX11-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB87_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX11-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -21482,35 +21492,35 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr ; GFX10-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB87_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX10-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB87_1 @@ -21521,36 +21531,36 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr ; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off +; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB87_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_add_f32_e32 v3, v4, v3 -; GFX90A-NEXT: v_add_f32_e32 v4, v7, v6 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB87_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -21560,36 +21570,36 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr ; GFX908-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off +; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB87_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX908-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX908-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX908-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB87_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -21599,37 +21609,37 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr ; GFX8-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB87_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX8-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB87_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -21754,39 +21764,41 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB88_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: v_dual_add_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v3, v6, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -21801,34 +21813,34 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB88_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_add_f32_e32 v3, v6, v3 -; GFX10-NEXT: v_add_f32_e32 v5, v7, v5 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB88_1 @@ -21842,33 +21854,33 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB88_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_add_f32_e32 v4, v6, v4 -; GFX90A-NEXT: v_add_f32_e32 v3, v7, v3 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB88_1 @@ -21882,33 +21894,33 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB88_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_add_f32_e32 v5, v6, v5 -; GFX908-NEXT: v_add_f32_e32 v3, v7, v3 -; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: v_mov_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB88_1 @@ -21922,34 +21934,34 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB88_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_add_f32_e32 v5, v6, v5 -; GFX8-NEXT: v_add_f32_e32 v3, v7, v3 -; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v3 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB88_1 @@ -22079,41 +22091,41 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX11-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB89_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX11-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -22126,35 +22138,35 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX10-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB89_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX10-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB89_1 @@ -22165,36 +22177,36 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off +; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB89_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_add_f32_e32 v3, v4, v3 -; GFX90A-NEXT: v_add_f32_e32 v4, v7, v6 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB89_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -22204,36 +22216,36 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX908-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off +; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB89_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX908-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX908-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX908-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB89_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -22243,37 +22255,37 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX8-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB89_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX8-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB89_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -22398,39 +22410,41 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__maybe_remote(ptr addrs ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB90_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: v_dual_add_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v3, v6, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -22445,34 +22459,34 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__maybe_remote(ptr addrs ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB90_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_add_f32_e32 v3, v6, v3 -; GFX10-NEXT: v_add_f32_e32 v5, v7, v5 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB90_1 @@ -22486,33 +22500,33 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__maybe_remote(ptr addrs ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB90_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_add_f32_e32 v4, v6, v4 -; GFX90A-NEXT: v_add_f32_e32 v3, v7, v3 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB90_1 @@ -22526,33 +22540,33 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__maybe_remote(ptr addrs ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB90_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_add_f32_e32 v5, v6, v5 -; GFX908-NEXT: v_add_f32_e32 v3, v7, v3 -; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: v_mov_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB90_1 @@ -22566,34 +22580,34 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__maybe_remote(ptr addrs ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB90_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_add_f32_e32 v5, v6, v5 -; GFX8-NEXT: v_add_f32_e32 v3, v7, v3 -; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v3 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB90_1 @@ -22723,41 +22737,41 @@ define void @global_agent_atomic_fadd_noret_v2bf16__maybe_remote(ptr addrspace(1 ; GFX11-LABEL: global_agent_atomic_fadd_noret_v2bf16__maybe_remote: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB91_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX11-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -22770,35 +22784,35 @@ define void @global_agent_atomic_fadd_noret_v2bf16__maybe_remote(ptr addrspace(1 ; GFX10-LABEL: global_agent_atomic_fadd_noret_v2bf16__maybe_remote: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB91_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX10-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB91_1 @@ -22809,36 +22823,36 @@ define void @global_agent_atomic_fadd_noret_v2bf16__maybe_remote(ptr addrspace(1 ; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2bf16__maybe_remote: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off +; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB91_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_add_f32_e32 v3, v4, v3 -; GFX90A-NEXT: v_add_f32_e32 v4, v7, v6 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB91_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -22848,36 +22862,36 @@ define void @global_agent_atomic_fadd_noret_v2bf16__maybe_remote(ptr addrspace(1 ; GFX908-LABEL: global_agent_atomic_fadd_noret_v2bf16__maybe_remote: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off +; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB91_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX908-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX908-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX908-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB91_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -22887,37 +22901,37 @@ define void @global_agent_atomic_fadd_noret_v2bf16__maybe_remote(ptr addrspace(1 ; GFX8-LABEL: global_agent_atomic_fadd_noret_v2bf16__maybe_remote: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB91_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX8-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB91_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -23156,34 +23170,34 @@ define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #1 { ; ; GFX8-LABEL: infer_as_before_atomic: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b64 s[2:3], exec -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX8-NEXT: s_mov_b64 s[0:1], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB92_3 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v2, s2 -; GFX8-NEXT: s_mov_b64 s[2:3], 0 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX8-NEXT: s_bcnt1_i32_b64 s5, s[0:1] +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v4, s5 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s4, s[2:3], 0x0 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s4 ; GFX8-NEXT: .LBB92_2: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_mov_b32_e32 v4, s1 -; GFX8-NEXT: v_add_f32_e32 v0, v1, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s0 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_add_f32_e32 v2, v3, v4 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX8-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX8-NEXT: s_cbranch_execnz .LBB92_2 ; GFX8-NEXT: .LBB92_3: ; GFX8-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll index d70159b0b0ac7..e2fde562d36b1 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll @@ -32,13 +32,13 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v3, v4 +; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -77,13 +77,13 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4 +; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -101,13 +101,13 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v5 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -193,13 +193,13 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v3, v4 +; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -238,13 +238,13 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4 +; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -262,13 +262,13 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v5 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -356,13 +356,13 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v3, v4 +; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -401,13 +401,13 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4 +; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -425,13 +425,13 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v5 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -517,21 +517,21 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v5, v[0:1], off +; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v4, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB3_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -561,20 +561,20 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off +; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB3_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -584,20 +584,20 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off +; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB3_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -672,21 +672,21 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2044 +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v4, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB4_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -716,20 +716,20 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB4_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -739,20 +739,20 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB4_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -830,21 +830,21 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:-2048 +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v4, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB5_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -874,20 +874,20 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:-2048 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB5_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -897,20 +897,20 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:-2048 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB5_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -991,13 +991,13 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v3, v4 +; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -1036,13 +1036,13 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4 +; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -1062,13 +1062,13 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v5 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1155,21 +1155,21 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX940-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2044 +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v4, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB7_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1199,22 +1199,22 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX90A-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB7_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1224,20 +1224,20 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX908-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB7_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1317,13 +1317,13 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v3, v4 +; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -1341,14 +1341,15 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_max_f32 v3, v2, v2 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f32_e32 v5, v4, v4 -; GFX11-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX11-NEXT: v_max_f32_e32 v3, v4, v4 +; GFX11-NEXT: v_max_f32_e32 v3, v3, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1368,14 +1369,14 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_max_f32_e32 v3, v2, v2 -; GFX10-NEXT: v_max_f32_e32 v5, v4, v4 -; GFX10-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX10-NEXT: v_max_f32_e32 v3, v4, v4 +; GFX10-NEXT: v_max_f32_e32 v3, v3, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -1395,13 +1396,13 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4 +; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -1419,13 +1420,13 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v5 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1546,13 +1547,13 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__a ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v3, v4 +; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -1591,13 +1592,13 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__a ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4 +; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -1615,13 +1616,13 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__a ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v5 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1711,13 +1712,13 @@ define float @global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v3, v4 +; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -1756,13 +1757,13 @@ define float @global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4 +; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -1780,13 +1781,13 @@ define float @global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v5 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1872,13 +1873,13 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v3, v4 +; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -1917,13 +1918,13 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4 +; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -1941,13 +1942,13 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v5 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -2035,13 +2036,13 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fi ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v3, v4 +; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -2080,13 +2081,13 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fi ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4 +; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -2104,13 +2105,13 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fi ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v5 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -2196,21 +2197,21 @@ define void @global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v5, v[0:1], off +; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v4, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB13_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2240,20 +2241,20 @@ define void @global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off +; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2263,20 +2264,20 @@ define void @global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off +; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB13_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2351,21 +2352,21 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2044 +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v4, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB14_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2395,20 +2396,20 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2418,20 +2419,20 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB14_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2509,21 +2510,21 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_f ; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:-2048 +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v4, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB15_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2553,20 +2554,20 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_f ; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:-2048 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB15_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2576,20 +2577,20 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_f ; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:-2048 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB15_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2670,13 +2671,13 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v3, v4 +; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -2715,13 +2716,13 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4 +; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -2741,13 +2742,13 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v5 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -2834,21 +2835,21 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_ ; GFX940-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2044 +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v4, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB17_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2878,22 +2879,22 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_ ; GFX90A-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2903,20 +2904,20 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_ ; GFX908-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB17_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2990,15 +2991,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[8:9], v[4:5] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -3028,15 +3029,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX11-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -3074,15 +3075,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] ; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9] +; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] ; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -3100,15 +3101,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: v_mov_b32_e32 v6, v4 -; GFX8-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] ; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9] +; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -3163,15 +3164,15 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[8:9], v[4:5] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -3201,15 +3202,15 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX11-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -3247,15 +3248,15 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040 +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] ; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9] +; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] ; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -3275,15 +3276,15 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7f8, v0 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v1 ; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[6:7] +; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -3337,15 +3338,15 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[8:9], v[4:5] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -3375,15 +3376,15 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX11-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -3421,15 +3422,15 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048 +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] ; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9] +; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] ; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -3449,15 +3450,15 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v1 ; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[6:7] +; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -3510,21 +3511,21 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off +; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[8:9], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -3547,22 +3548,22 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX11-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[6:7], v[0:1], off +; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -3592,21 +3593,21 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX908-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[6:7], v[0:1], off +; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX908-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc +; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX908-NEXT: v_mov_b32_e32 v7, v5 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v5, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_mov_b32_e32 v4, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB21_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3616,21 +3617,21 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX8-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX8-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v5, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB21_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3673,21 +3674,21 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off offset:2040 +; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[8:9], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -3710,22 +3711,22 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX11-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[6:7], v[0:1], off offset:2040 +; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 +; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:2040 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -3755,21 +3756,21 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX908-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:2040 +; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040 +; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX908-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 glc +; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:2040 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX908-NEXT: v_mov_b32_e32 v7, v5 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v5, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_mov_b32_e32 v4, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB22_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3781,21 +3782,21 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7f8, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX8-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v5, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB22_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3839,21 +3840,21 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off offset:-2048 +; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[8:9], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -3876,22 +3877,22 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX11-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[6:7], v[0:1], off offset:-2048 +; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 +; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:-2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -3921,21 +3922,21 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX908-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:-2048 +; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048 +; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX908-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 glc +; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX908-NEXT: v_mov_b32_e32 v7, v5 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v5, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_mov_b32_e32 v4, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB23_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3947,21 +3948,21 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX8-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v5, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB23_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4006,15 +4007,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[8:9], v[4:5] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -4044,15 +4045,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX11-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -4072,15 +4073,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v7, v5 ; GFX10-NEXT: v_mov_b32_e32 v6, v4 -; GFX10-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX10-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX10-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] +; GFX10-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX10-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -4101,13 +4102,13 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] ; GFX90A-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX90A-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9] +; GFX90A-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] ; GFX90A-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -4125,15 +4126,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] ; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9] +; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] ; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -4151,15 +4152,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: v_mov_b32_e32 v6, v4 -; GFX8-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] ; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9] +; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -4177,28 +4178,26 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_mov_b32_e32 v7, v1 -; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: v_mov_b32_e32 v5, v1 +; GFX7-NEXT: v_mov_b32_e32 v4, v0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 +; GFX7-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v11, v1 ; GFX7-NEXT: v_mov_b32_e32 v10, v0 -; GFX7-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] ; GFX7-NEXT: v_max_f64 v[0:1], v[10:11], v[10:11] -; GFX7-NEXT: v_max_f64 v[8:9], v[0:1], v[2:3] +; GFX7-NEXT: v_max_f64 v[8:9], v[0:1], v[6:7] ; GFX7-NEXT: v_mov_b32_e32 v0, v8 ; GFX7-NEXT: v_mov_b32_e32 v1, v9 ; GFX7-NEXT: v_mov_b32_e32 v2, v10 ; GFX7-NEXT: v_mov_b32_e32 v3, v11 -; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] @@ -4213,14 +4212,13 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: v_mov_b32_e32 v7, v1 -; GFX6-NEXT: v_mov_b32_e32 v6, v0 +; GFX6-NEXT: v_mov_b32_e32 v5, v1 +; GFX6-NEXT: v_mov_b32_e32 v4, v0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 +; GFX6-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -4228,14 +4226,13 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX6-NEXT: v_mov_b32_e32 v11, v1 ; GFX6-NEXT: v_mov_b32_e32 v10, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] ; GFX6-NEXT: v_max_f64 v[0:1], v[10:11], v[10:11] -; GFX6-NEXT: v_max_f64 v[8:9], v[0:1], v[2:3] +; GFX6-NEXT: v_max_f64 v[8:9], v[0:1], v[6:7] ; GFX6-NEXT: v_mov_b32_e32 v0, v8 ; GFX6-NEXT: v_mov_b32_e32 v1, v9 ; GFX6-NEXT: v_mov_b32_e32 v2, v10 ; GFX6-NEXT: v_mov_b32_e32 v3, v11 -; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc +; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] @@ -4259,15 +4256,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[8:9], v[4:5] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -4297,15 +4294,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX11-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -4343,15 +4340,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__ ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] ; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9] +; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] ; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -4369,15 +4366,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: v_mov_b32_e32 v6, v4 -; GFX8-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] ; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9] +; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -4436,8 +4433,9 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: global_load_b32 v5, v[0:1], off @@ -4449,12 +4447,11 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v7 +; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v2 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -4487,14 +4484,14 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX940-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 -; GFX940-NEXT: v_lshrrev_b32_e32 v6, v3, v7 -; GFX940-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX940-NEXT: v_max_f16_e32 v6, v6, v6 -; GFX940-NEXT: v_max_f16_e32 v5, v6, v5 +; GFX940-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX940-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX940-NEXT: v_max_f16_e32 v5, v5, v2 ; GFX940-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX940-NEXT: buffer_wbl2 sc1 @@ -4514,8 +4511,9 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX11-NEXT: global_load_b32 v5, v[0:1], off @@ -4527,12 +4525,11 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v5, v5, v7 +; GFX11-NEXT: v_max_f16_e32 v5, v5, v2 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -4556,6 +4553,7 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 @@ -4567,10 +4565,9 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 -; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX10-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX10-NEXT: v_max_f16_e32 v5, v5, v7 +; GFX10-NEXT: v_max_f16_e32 v5, v5, v2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4599,14 +4596,14 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v3, v7 -; GFX90A-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX90A-NEXT: v_max_f16_e32 v6, v6, v6 -; GFX90A-NEXT: v_max_f16_e32 v5, v6, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX90A-NEXT: v_max_f16_e32 v5, v5, v2 ; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc @@ -4633,14 +4630,14 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 -; GFX908-NEXT: v_lshrrev_b32_e32 v7, v3, v6 -; GFX908-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX908-NEXT: v_max_f16_e32 v7, v7, v7 -; GFX908-NEXT: v_max_f16_e32 v5, v7, v5 +; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX908-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX908-NEXT: v_max_f16_e32 v5, v5, v2 ; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX908-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc @@ -4667,17 +4664,17 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, v3, v6 -; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v7, v7, v7 -; GFX8-NEXT: v_max_f16_e32 v5, v7, v5 -; GFX8-NEXT: v_and_b32_e32 v8, v6, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX8-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX8-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX8-NEXT: v_or_b32_e32 v5, v8, v5 +; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 ; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -4790,10 +4787,11 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -4803,12 +4801,11 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v7 +; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v2 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -4843,14 +4840,14 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX940-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 -; GFX940-NEXT: v_lshrrev_b32_e32 v6, v3, v7 -; GFX940-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX940-NEXT: v_max_f16_e32 v6, v6, v6 -; GFX940-NEXT: v_max_f16_e32 v5, v6, v5 +; GFX940-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX940-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX940-NEXT: v_max_f16_e32 v5, v5, v2 ; GFX940-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX940-NEXT: buffer_wbl2 sc1 @@ -4871,10 +4868,11 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: global_load_b32 v5, v[0:1], off ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -4884,12 +4882,11 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v5, v5, v7 +; GFX11-NEXT: v_max_f16_e32 v5, v5, v2 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -4914,9 +4911,10 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: global_load_dword v5, v[0:1], off ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -4925,10 +4923,9 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 -; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX10-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX10-NEXT: v_max_f16_e32 v5, v5, v7 +; GFX10-NEXT: v_max_f16_e32 v5, v5, v2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4958,14 +4955,14 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v3, v7 -; GFX90A-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX90A-NEXT: v_max_f16_e32 v6, v6, v6 -; GFX90A-NEXT: v_max_f16_e32 v5, v6, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX90A-NEXT: v_max_f16_e32 v5, v5, v2 ; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc @@ -4993,14 +4990,14 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 -; GFX908-NEXT: v_lshrrev_b32_e32 v7, v3, v6 -; GFX908-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX908-NEXT: v_max_f16_e32 v7, v7, v7 -; GFX908-NEXT: v_max_f16_e32 v5, v7, v5 +; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX908-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX908-NEXT: v_max_f16_e32 v5, v5, v2 ; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX908-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc @@ -5028,17 +5025,17 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, v3, v6 -; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v7, v7, v7 -; GFX8-NEXT: v_max_f16_e32 v5, v7, v5 -; GFX8-NEXT: v_and_b32_e32 v8, v6, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX8-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX8-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX8-NEXT: v_or_b32_e32 v5, v8, v5 +; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 ; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -5155,10 +5152,11 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -5168,12 +5166,11 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v7 +; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v2 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -5209,14 +5206,14 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX940-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 -; GFX940-NEXT: v_lshrrev_b32_e32 v6, v3, v7 -; GFX940-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX940-NEXT: v_max_f16_e32 v6, v6, v6 -; GFX940-NEXT: v_max_f16_e32 v5, v6, v5 +; GFX940-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX940-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX940-NEXT: v_max_f16_e32 v5, v5, v2 ; GFX940-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX940-NEXT: buffer_wbl2 sc1 @@ -5237,10 +5234,11 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: global_load_b32 v5, v[0:1], off ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -5250,12 +5248,11 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v5, v5, v7 +; GFX11-NEXT: v_max_f16_e32 v5, v5, v2 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -5280,9 +5277,10 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: global_load_dword v5, v[0:1], off ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -5291,10 +5289,9 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 -; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX10-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX10-NEXT: v_max_f16_e32 v5, v5, v7 +; GFX10-NEXT: v_max_f16_e32 v5, v5, v2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -5324,14 +5321,14 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v3, v7 -; GFX90A-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX90A-NEXT: v_max_f16_e32 v6, v6, v6 -; GFX90A-NEXT: v_max_f16_e32 v5, v6, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX90A-NEXT: v_max_f16_e32 v5, v5, v2 ; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc @@ -5359,14 +5356,14 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 -; GFX908-NEXT: v_lshrrev_b32_e32 v7, v3, v6 -; GFX908-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX908-NEXT: v_max_f16_e32 v7, v7, v7 -; GFX908-NEXT: v_max_f16_e32 v5, v7, v5 +; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX908-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX908-NEXT: v_max_f16_e32 v5, v5, v2 ; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX908-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc @@ -5394,17 +5391,17 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, v3, v6 -; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v7, v7, v7 -; GFX8-NEXT: v_max_f16_e32 v5, v7, v5 -; GFX8-NEXT: v_and_b32_e32 v8, v6, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX8-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX8-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX8-NEXT: v_or_b32_e32 v5, v8, v5 +; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 ; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -5520,8 +5517,9 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: global_load_b32 v4, v[0:1], off @@ -5533,10 +5531,9 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3 -; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v7 +; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 @@ -5570,13 +5567,13 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX940-NEXT: v_not_b32_e32 v6, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX940-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX940-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX940-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX940-NEXT: v_max_f16_e32 v4, v4, v7 +; GFX940-NEXT: v_max_f16_e32 v4, v4, v2 ; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX940-NEXT: buffer_wbl2 sc1 @@ -5596,8 +5593,9 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX11-NEXT: global_load_b32 v4, v[0:1], off @@ -5609,10 +5607,9 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX11-NEXT: v_max_f16_e32 v3, v3, v7 +; GFX11-NEXT: v_max_f16_e32 v3, v3, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 @@ -5637,6 +5634,7 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 @@ -5648,9 +5646,8 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX10-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX10-NEXT: v_max_f16_e32 v3, v3, v7 +; GFX10-NEXT: v_max_f16_e32 v3, v3, v2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -5679,13 +5676,13 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX90A-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX90A-NEXT: v_max_f16_e32 v4, v4, v7 +; GFX90A-NEXT: v_max_f16_e32 v4, v4, v2 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc @@ -5712,13 +5709,13 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 ; GFX908-NEXT: v_not_b32_e32 v6, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX908-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX908-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX908-NEXT: v_max_f16_e32 v3, v3, v7 +; GFX908-NEXT: v_max_f16_e32 v3, v3, v2 ; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc @@ -5745,16 +5742,16 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 ; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX8-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX8-NEXT: v_max_f16_e32 v3, v3, v7 -; GFX8-NEXT: v_and_b32_e32 v8, v4, v6 +; GFX8-NEXT: v_max_f16_e32 v3, v3, v2 +; GFX8-NEXT: v_and_b32_e32 v7, v4, v6 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX8-NEXT: v_or_b32_e32 v3, v8, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v7, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -5862,36 +5859,36 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v6, v3 +; GFX12-NEXT: v_not_b32_e32 v5, v5 ; GFX12-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3 -; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v7 +; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -5909,29 +5906,29 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: global_load_dword v5, v[0:1], off -; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX940-NEXT: global_load_dword v3, v[0:1], off +; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v6, v4 +; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0 +; GFX940-NEXT: v_not_b32_e32 v5, v5 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX940-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX940-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX940-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX940-NEXT: v_max_f16_e32 v4, v4, v7 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX940-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX940-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB30_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5941,37 +5938,37 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX11-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: global_load_b32 v4, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v6, v3 +; GFX11-NEXT: v_not_b32_e32 v5, v5 ; GFX11-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX11-NEXT: v_max_f16_e32 v3, v3, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -5983,31 +5980,31 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX10-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX10-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX10-NEXT: global_load_dword v4, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX10-NEXT: v_not_b32_e32 v6, v3 +; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX10-NEXT: v_not_b32_e32 v5, v5 ; GFX10-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX10-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX10-NEXT: v_max_f16_e32 v3, v3, v7 -; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX10-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB30_1 @@ -6018,31 +6015,31 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX90A-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: global_load_dword v5, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v6, v4 +; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX90A-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX90A-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX90A-NEXT: v_max_f16_e32 v4, v4, v7 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 -; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc +; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB30_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6052,31 +6049,31 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX908-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 +; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: global_load_dword v4, v[0:1], off -; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX908-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 -; GFX908-NEXT: v_not_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX908-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX908-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX908-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX908-NEXT: v_max_f16_e32 v3, v3, v7 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX908-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB30_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6086,32 +6083,32 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX8-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX8-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 -; GFX8-NEXT: v_not_b32_e32 v6, v3 +; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX8-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX8-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX8-NEXT: v_max_f16_e32 v3, v3, v7 -; GFX8-NEXT: v_and_b32_e32 v8, v4, v6 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX8-NEXT: v_or_b32_e32 v3, v8, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX8-NEXT: v_and_b32_e32 v7, v3, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB30_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6216,36 +6213,36 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v6, v3 +; GFX12-NEXT: v_not_b32_e32 v5, v5 ; GFX12-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3 -; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v7 +; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -6264,29 +6261,29 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: global_load_dword v5, v[0:1], off -; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX940-NEXT: global_load_dword v3, v[0:1], off +; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v6, v4 +; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0 +; GFX940-NEXT: v_not_b32_e32 v5, v5 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX940-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX940-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX940-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX940-NEXT: v_max_f16_e32 v4, v4, v7 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX940-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX940-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB31_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6296,37 +6293,37 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX11-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX11-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: global_load_b32 v4, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v6, v3 +; GFX11-NEXT: v_not_b32_e32 v5, v5 ; GFX11-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX11-NEXT: v_max_f16_e32 v3, v3, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -6338,31 +6335,31 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX10-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX10-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX10-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX10-NEXT: global_load_dword v4, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX10-NEXT: v_not_b32_e32 v6, v3 +; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX10-NEXT: v_not_b32_e32 v5, v5 ; GFX10-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX10-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX10-NEXT: v_max_f16_e32 v3, v3, v7 -; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX10-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB31_1 @@ -6373,31 +6370,31 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX90A-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: global_load_dword v5, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v6, v4 +; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX90A-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX90A-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX90A-NEXT: v_max_f16_e32 v4, v4, v7 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 -; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc +; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB31_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6407,31 +6404,31 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX908-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 +; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: global_load_dword v4, v[0:1], off -; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX908-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 -; GFX908-NEXT: v_not_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX908-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX908-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX908-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX908-NEXT: v_max_f16_e32 v3, v3, v7 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX908-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB31_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6441,32 +6438,32 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX8-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX8-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 -; GFX8-NEXT: v_not_b32_e32 v6, v3 +; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX8-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX8-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX8-NEXT: v_max_f16_e32 v3, v3, v7 -; GFX8-NEXT: v_and_b32_e32 v8, v4, v6 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX8-NEXT: v_or_b32_e32 v3, v8, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX8-NEXT: v_and_b32_e32 v7, v3, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB31_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6572,15 +6569,15 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: v_max_num_f16_e32 v3, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v5, v4, v4 -; GFX12-NEXT: v_max_num_f16_e32 v3, v5, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v3, v4, v4 +; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 @@ -6605,14 +6602,14 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX940-NEXT: s_mov_b32 s2, 0xffff0000 ; GFX940-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f16_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f16_e32 v3, v3, v4 +; GFX940-NEXT: v_max_f16_e32 v3, v3, v2 ; GFX940-NEXT: v_and_or_b32 v4, v5, s2, v3 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 sc0 @@ -6631,15 +6628,15 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: v_max_f16_e32 v3, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v5, v4, v4 -; GFX11-NEXT: v_max_f16_e32 v3, v5, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v3, v4, v4 +; GFX11-NEXT: v_max_f16_e32 v3, v3, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 @@ -6662,14 +6659,14 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2046 +; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_max_f16_e32 v3, v2, v2 -; GFX10-NEXT: v_max_f16_e32 v5, v4, v4 -; GFX10-NEXT: v_max_f16_e32 v3, v5, v3 +; GFX10-NEXT: v_max_f16_e32 v3, v4, v4 +; GFX10-NEXT: v_max_f16_e32 v3, v3, v2 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX10-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6691,14 +6688,14 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 ; GFX90A-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f16_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f16_e32 v3, v3, v4 +; GFX90A-NEXT: v_max_f16_e32 v3, v3, v2 ; GFX90A-NEXT: v_and_or_b32 v4, v5, s6, v3 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -6717,14 +6714,14 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 ; GFX908-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f16_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f16_e32 v3, v3, v5 +; GFX908-NEXT: v_max_f16_e32 v3, v3, v2 ; GFX908-NEXT: v_and_or_b32 v3, v4, s6, v3 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2046 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -6745,19 +6742,19 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_e32 v1, v2, v2 ; GFX8-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v0, v1, v1 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 -; GFX8-NEXT: v_max_f16_e32 v0, v0, v5 -; GFX8-NEXT: v_or_b32_e32 v0, v6, v0 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_max_f16_e32 v0, v6, v6 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX8-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX8-NEXT: v_or_b32_e32 v5, v2, v0 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB32_1 @@ -6847,24 +6844,24 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2046 +; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; GFX12-NEXT: v_max_num_f16_e32 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_max_num_f16_e32 v3, v2, v2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f16_e32 v5, v4, v4 +; GFX12-NEXT: v_max_num_f16_e32 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v3, v5, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v4 +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -6878,23 +6875,23 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX940-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2046 +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX940-NEXT: s_mov_b32 s2, 0xffff0000 ; GFX940-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_max_f16_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f16_e32 v4, v5, v5 -; GFX940-NEXT: v_max_f16_e32 v3, v4, v3 -; GFX940-NEXT: v_and_or_b32 v4, v5, s2, v3 +; GFX940-NEXT: v_max_f16_e32 v2, v3, v3 +; GFX940-NEXT: v_max_f16_e32 v2, v2, v4 +; GFX940-NEXT: v_and_or_b32 v2, v3, s2, v2 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 sc0 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB33_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6904,25 +6901,25 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX11-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2046 +; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; GFX11-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_max_f16_e32 v3, v2, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f16_e32 v5, v4, v4 +; GFX11-NEXT: v_max_f16_e32 v2, v3, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v3, v5, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: v_max_f16_e32 v2, v2, v4 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX11-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -6934,24 +6931,24 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX10-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2046 +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2046 +; GFX10-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_max_f16_e32 v3, v2, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_max_f16_e32 v5, v4, v4 -; GFX10-NEXT: v_max_f16_e32 v3, v5, v3 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX10-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX10-NEXT: v_max_f16_e32 v2, v3, v3 +; GFX10-NEXT: v_max_f16_e32 v2, v2, v4 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2046 glc +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB33_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6961,22 +6958,22 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX90A-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2046 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 ; GFX90A-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_max_f16_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f16_e32 v4, v5, v5 -; GFX90A-NEXT: v_max_f16_e32 v3, v4, v3 -; GFX90A-NEXT: v_and_or_b32 v4, v5, s6, v3 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 glc +; GFX90A-NEXT: v_max_f16_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v4 +; GFX90A-NEXT: v_and_or_b32 v2, v3, s6, v2 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB33_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6986,22 +6983,22 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX908-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2046 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 ; GFX908-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f16_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f16_e32 v5, v4, v4 -; GFX908-NEXT: v_max_f16_e32 v3, v5, v3 -; GFX908-NEXT: v_and_or_b32 v3, v4, s6, v3 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2046 glc +; GFX908-NEXT: v_max_f16_e32 v2, v3, v3 +; GFX908-NEXT: v_max_f16_e32 v2, v2, v4 +; GFX908-NEXT: v_and_or_b32 v2, v3, s6, v2 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB33_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7013,22 +7010,22 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fe, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX8-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_max_f16_e32 v3, v2, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f16_e32 v5, v4, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX8-NEXT: v_max_f16_e32 v3, v5, v3 -; GFX8-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_max_f16_e32 v2, v3, v3 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v4 +; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB33_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7117,10 +7114,11 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -7130,12 +7128,11 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v7 +; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v2 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -7171,14 +7168,14 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX940-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 -; GFX940-NEXT: v_lshrrev_b32_e32 v6, v3, v7 -; GFX940-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX940-NEXT: v_max_f16_e32 v6, v6, v6 -; GFX940-NEXT: v_max_f16_e32 v5, v6, v5 +; GFX940-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX940-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX940-NEXT: v_max_f16_e32 v5, v5, v2 ; GFX940-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 @@ -7199,10 +7196,11 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: global_load_b32 v5, v[0:1], off ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -7212,12 +7210,11 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v5, v5, v7 +; GFX11-NEXT: v_max_f16_e32 v5, v5, v2 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -7242,9 +7239,10 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: global_load_dword v5, v[0:1], off ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -7253,10 +7251,9 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 -; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX10-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX10-NEXT: v_max_f16_e32 v5, v5, v7 +; GFX10-NEXT: v_max_f16_e32 v5, v5, v2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -7286,14 +7283,14 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v3, v7 -; GFX90A-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX90A-NEXT: v_max_f16_e32 v6, v6, v6 -; GFX90A-NEXT: v_max_f16_e32 v5, v6, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX90A-NEXT: v_max_f16_e32 v5, v5, v2 ; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX90A-NEXT: buffer_wbl2 @@ -7323,14 +7320,14 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 -; GFX908-NEXT: v_lshrrev_b32_e32 v7, v3, v6 -; GFX908-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX908-NEXT: v_max_f16_e32 v7, v7, v7 -; GFX908-NEXT: v_max_f16_e32 v5, v7, v5 +; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX908-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX908-NEXT: v_max_f16_e32 v5, v5, v2 ; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX908-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc @@ -7358,17 +7355,17 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, v3, v6 -; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v7, v7, v7 -; GFX8-NEXT: v_max_f16_e32 v5, v7, v5 -; GFX8-NEXT: v_and_b32_e32 v8, v6, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX8-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX8-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX8-NEXT: v_or_b32_e32 v5, v8, v5 +; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 ; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -7483,37 +7480,37 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v6, v3 +; GFX12-NEXT: v_not_b32_e32 v5, v5 ; GFX12-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3 -; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v7 +; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -7531,29 +7528,29 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: global_load_dword v5, v[0:1], off -; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX940-NEXT: global_load_dword v3, v[0:1], off +; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v6, v4 +; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0 +; GFX940-NEXT: v_not_b32_e32 v5, v5 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX940-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX940-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX940-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX940-NEXT: v_max_f16_e32 v4, v4, v7 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX940-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX940-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 sc1 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB35_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7563,37 +7560,37 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX11-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: global_load_b32 v4, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v6, v3 +; GFX11-NEXT: v_not_b32_e32 v5, v5 ; GFX11-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX11-NEXT: v_max_f16_e32 v3, v3, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -7605,31 +7602,31 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX10-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX10-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX10-NEXT: global_load_dword v4, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX10-NEXT: v_not_b32_e32 v6, v3 +; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX10-NEXT: v_not_b32_e32 v5, v5 ; GFX10-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX10-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX10-NEXT: v_max_f16_e32 v3, v3, v7 -; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX10-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB35_1 @@ -7640,33 +7637,33 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX90A-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: global_load_dword v5, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v6, v4 +; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX90A-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX90A-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX90A-NEXT: v_max_f16_e32 v4, v4, v7 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB35_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7676,31 +7673,31 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX908-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 +; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: global_load_dword v4, v[0:1], off -; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX908-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 -; GFX908-NEXT: v_not_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX908-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX908-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX908-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX908-NEXT: v_max_f16_e32 v3, v3, v7 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX908-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB35_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7710,32 +7707,32 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX8-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX8-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 -; GFX8-NEXT: v_not_b32_e32 v6, v3 +; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX8-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX8-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX8-NEXT: v_max_f16_e32 v3, v3, v7 -; GFX8-NEXT: v_and_b32_e32 v8, v4, v6 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX8-NEXT: v_or_b32_e32 v3, v8, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX8-NEXT: v_and_b32_e32 v7, v3, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB35_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -11684,15 +11681,15 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 -; GFX12-NEXT: v_pk_max_num_f16 v3, v5, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 +; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -11714,14 +11711,14 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX940-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: v_pk_max_f16 v3, v5, v5 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_max_f16 v4, v3, v4 +; GFX940-NEXT: v_pk_max_f16 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -11739,15 +11736,15 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX11-NEXT: v_pk_max_f16 v3, v5, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v3, v4, v4 +; GFX11-NEXT: v_pk_max_f16 v3, v3, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -11767,14 +11764,14 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX10-NEXT: v_pk_max_f16 v3, v5, v3 +; GFX10-NEXT: v_pk_max_f16 v3, v4, v4 +; GFX10-NEXT: v_pk_max_f16 v3, v3, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -11794,13 +11791,13 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX90A-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5 -; GFX90A-NEXT: v_pk_max_f16 v4, v3, v4 +; GFX90A-NEXT: v_pk_max_f16 v4, v3, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -11818,13 +11815,13 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX908-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_pk_max_f16 v5, v2, v2 ; GFX908-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX908-NEXT: v_pk_max_f16 v3, v3, v5 +; GFX908-NEXT: v_pk_max_f16 v3, v3, v2 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -11842,21 +11839,21 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v3, v2, v2 -; GFX8-NEXT: v_max_f16_sdwa v6, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v7, v4, v4 -; GFX8-NEXT: v_max_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v3, v7, v3 -; GFX8-NEXT: v_or_b32_e32 v3, v3, v5 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_mov_b32_e32 v6, v3 +; GFX8-NEXT: v_max_f16_sdwa v3, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 +; GFX8-NEXT: v_max_f16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX8-NEXT: v_or_b32_e32 v5, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB46_1 @@ -11977,15 +11974,15 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 -; GFX12-NEXT: v_pk_max_num_f16 v3, v5, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 +; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -12007,14 +12004,14 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX940-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: v_pk_max_f16 v3, v5, v5 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_max_f16 v4, v3, v4 +; GFX940-NEXT: v_pk_max_f16 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -12032,15 +12029,15 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX11-NEXT: v_pk_max_f16 v3, v5, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v3, v4, v4 +; GFX11-NEXT: v_pk_max_f16 v3, v3, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -12060,14 +12057,14 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX10-NEXT: v_pk_max_f16 v3, v5, v3 +; GFX10-NEXT: v_pk_max_f16 v3, v4, v4 +; GFX10-NEXT: v_pk_max_f16 v3, v3, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -12087,13 +12084,13 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX90A-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5 -; GFX90A-NEXT: v_pk_max_f16 v4, v3, v4 +; GFX90A-NEXT: v_pk_max_f16 v4, v3, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -12111,13 +12108,13 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX908-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_pk_max_f16 v5, v2, v2 ; GFX908-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX908-NEXT: v_pk_max_f16 v3, v3, v5 +; GFX908-NEXT: v_pk_max_f16 v3, v3, v2 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -12137,21 +12134,21 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v0, v2, v2 -; GFX8-NEXT: v_max_f16_sdwa v6, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v7, v1, v1 -; GFX8-NEXT: v_max_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v0, v7, v0 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 +; GFX8-NEXT: v_max_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX8-NEXT: v_or_b32_e32 v5, v5, v0 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB47_1 @@ -12272,15 +12269,15 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 +; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 -; GFX12-NEXT: v_pk_max_num_f16 v3, v5, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 +; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -12302,14 +12299,14 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX940-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: v_pk_max_f16 v3, v5, v5 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_max_f16 v4, v3, v4 +; GFX940-NEXT: v_pk_max_f16 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -12327,15 +12324,15 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 +; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX11-NEXT: v_pk_max_f16 v3, v5, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v3, v4, v4 +; GFX11-NEXT: v_pk_max_f16 v3, v3, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -12355,14 +12352,14 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX10-NEXT: v_pk_max_f16 v3, v5, v3 +; GFX10-NEXT: v_pk_max_f16 v3, v4, v4 +; GFX10-NEXT: v_pk_max_f16 v3, v3, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -12382,13 +12379,13 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX90A-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5 -; GFX90A-NEXT: v_pk_max_f16 v4, v3, v4 +; GFX90A-NEXT: v_pk_max_f16 v4, v3, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -12406,13 +12403,13 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX908-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_pk_max_f16 v5, v2, v2 ; GFX908-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX908-NEXT: v_pk_max_f16 v3, v3, v5 +; GFX908-NEXT: v_pk_max_f16 v3, v3, v2 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -12432,21 +12429,21 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v0, v2, v2 -; GFX8-NEXT: v_max_f16_sdwa v6, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v7, v1, v1 -; GFX8-NEXT: v_max_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v0, v7, v0 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 +; GFX8-NEXT: v_max_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX8-NEXT: v_or_b32_e32 v5, v5, v0 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB48_1 @@ -12570,21 +12567,21 @@ define void @global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 +; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v5, v3 +; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v4 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -12598,22 +12595,22 @@ define void @global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX940-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v5, v[0:1], off +; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v4, v5, v5 +; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_max_f16 v4, v4, v3 +; GFX940-NEXT: v_pk_max_f16 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB49_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -12623,22 +12620,22 @@ define void @global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX11-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v3, v5, v3 +; GFX11-NEXT: v_pk_max_f16 v2, v2, v4 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -12650,21 +12647,21 @@ define void @global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX10-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX10-NEXT: v_pk_max_f16 v3, v5, v3 +; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX10-NEXT: v_pk_max_f16 v2, v2, v4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB49_1 @@ -12675,20 +12672,20 @@ define void @global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off +; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v4, v5, v5 -; GFX90A-NEXT: v_pk_max_f16 v4, v4, v3 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc +; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX90A-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB49_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -12698,20 +12695,20 @@ define void @global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX908-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off +; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX908-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX908-NEXT: v_pk_max_f16 v3, v5, v3 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX908-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB49_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -12721,24 +12718,24 @@ define void @global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX8-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX8-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v7, v4, v4 -; GFX8-NEXT: v_max_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v5, v7, v6 -; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v6, v3, v3 +; GFX8-NEXT: v_max_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v6, v6, v5 +; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB49_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -12852,21 +12849,21 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2044 +; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 +; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v5, v3 +; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v4 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -12880,22 +12877,22 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX940-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2044 +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v4, v5, v5 +; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_max_f16 v4, v4, v3 +; GFX940-NEXT: v_pk_max_f16 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB50_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -12905,22 +12902,22 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX11-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044 +; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v3, v5, v3 +; GFX11-NEXT: v_pk_max_f16 v2, v2, v4 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -12932,21 +12929,21 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX10-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX10-NEXT: v_pk_max_f16 v3, v5, v3 +; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX10-NEXT: v_pk_max_f16 v2, v2, v4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB50_1 @@ -12957,20 +12954,20 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v4, v5, v5 -; GFX90A-NEXT: v_pk_max_f16 v4, v4, v3 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX90A-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB50_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -12980,20 +12977,20 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX908-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX908-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX908-NEXT: v_pk_max_f16 v3, v5, v3 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX908-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB50_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13005,24 +13002,24 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX8-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v7, v4, v4 -; GFX8-NEXT: v_max_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v5, v7, v6 -; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v6, v3, v3 +; GFX8-NEXT: v_max_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v6, v6, v5 +; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB50_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13137,21 +13134,21 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:-2048 +; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 +; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 +; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v5, v3 +; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v4 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -13165,22 +13162,22 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX940-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:-2048 +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v4, v5, v5 +; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_max_f16 v4, v4, v3 +; GFX940-NEXT: v_pk_max_f16 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB51_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13190,22 +13187,22 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX11-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:-2048 +; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 +; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v3, v5, v3 +; GFX11-NEXT: v_pk_max_f16 v2, v2, v4 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -13217,21 +13214,21 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX10-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:-2048 +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX10-NEXT: v_pk_max_f16 v3, v5, v3 +; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX10-NEXT: v_pk_max_f16 v2, v2, v4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB51_1 @@ -13242,20 +13239,20 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:-2048 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v4, v5, v5 -; GFX90A-NEXT: v_pk_max_f16 v4, v4, v3 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc +; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX90A-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB51_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13265,20 +13262,20 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX908-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:-2048 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX908-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX908-NEXT: v_pk_max_f16 v3, v5, v3 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX908-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB51_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13290,24 +13287,24 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX8-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v7, v4, v4 -; GFX8-NEXT: v_max_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v5, v7, v6 -; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v6, v3, v3 +; GFX8-NEXT: v_max_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v6, v6, v5 +; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB51_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13431,15 +13428,15 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 -; GFX12-NEXT: v_pk_max_num_f16 v3, v5, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 +; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v2 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS @@ -13462,14 +13459,14 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX940-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: v_pk_max_f16 v3, v5, v5 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_max_f16 v4, v3, v4 +; GFX940-NEXT: v_pk_max_f16 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -13487,15 +13484,15 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX11-NEXT: v_pk_max_f16 v3, v5, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v3, v4, v4 +; GFX11-NEXT: v_pk_max_f16 v3, v3, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -13515,14 +13512,14 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX10-NEXT: v_pk_max_f16 v3, v5, v3 +; GFX10-NEXT: v_pk_max_f16 v3, v4, v4 +; GFX10-NEXT: v_pk_max_f16 v3, v3, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -13542,13 +13539,13 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5 -; GFX90A-NEXT: v_pk_max_f16 v4, v3, v4 +; GFX90A-NEXT: v_pk_max_f16 v4, v3, v2 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -13568,13 +13565,13 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX908-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_pk_max_f16 v5, v2, v2 ; GFX908-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX908-NEXT: v_pk_max_f16 v3, v3, v5 +; GFX908-NEXT: v_pk_max_f16 v3, v3, v2 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -13594,21 +13591,21 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v0, v2, v2 -; GFX8-NEXT: v_max_f16_sdwa v6, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v7, v1, v1 -; GFX8-NEXT: v_max_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v0, v7, v0 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 +; GFX8-NEXT: v_max_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX8-NEXT: v_or_b32_e32 v5, v5, v0 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB52_1 @@ -13728,22 +13725,22 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2044 +; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 +; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v5, v3 +; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v4 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -13757,22 +13754,22 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX940-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2044 +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v4, v5, v5 +; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_max_f16 v4, v4, v3 +; GFX940-NEXT: v_pk_max_f16 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB53_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13782,22 +13779,22 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX11-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044 +; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v3, v5, v3 +; GFX11-NEXT: v_pk_max_f16 v2, v2, v4 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -13809,21 +13806,21 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX10-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX10-NEXT: v_pk_max_f16 v3, v5, v3 +; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX10-NEXT: v_pk_max_f16 v2, v2, v4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB53_1 @@ -13834,22 +13831,22 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX90A-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v4, v5, v5 -; GFX90A-NEXT: v_pk_max_f16 v4, v4, v3 +; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX90A-NEXT: v_pk_max_f16 v2, v2, v4 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB53_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13859,20 +13856,20 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX908-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX908-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX908-NEXT: v_pk_max_f16 v3, v5, v3 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX908-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB53_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13884,24 +13881,24 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX8-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v7, v4, v4 -; GFX8-NEXT: v_max_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v5, v7, v6 -; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v6, v3, v3 +; GFX8-NEXT: v_max_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v6, v6, v5 +; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB53_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14021,36 +14018,38 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX12-NEXT: v_dual_max_num_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_max_num_f32_e32 v3, v6, v3 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-NEXT: v_max_num_f32_e32 v5, v5, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v4 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -14067,35 +14066,35 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v6, v4 -; GFX940-NEXT: v_max_f32_e32 v3, v7, v3 -; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v3, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX940-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX940-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB54_1 @@ -14108,39 +14107,41 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: v_dual_max_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_max_f32_e32 v3, v6, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -14155,34 +14156,34 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_max_f32_e32 v3, v6, v3 -; GFX10-NEXT: v_max_f32_e32 v5, v7, v5 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX10-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB54_1 @@ -14196,33 +14197,33 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v6, v4 -; GFX90A-NEXT: v_max_f32_e32 v3, v7, v3 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX90A-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX90A-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB54_1 @@ -14236,33 +14237,33 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_max_f32_e32 v5, v6, v5 -; GFX908-NEXT: v_max_f32_e32 v3, v7, v3 -; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: v_mov_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX908-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB54_1 @@ -14276,34 +14277,34 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_max_f32_e32 v5, v6, v5 -; GFX8-NEXT: v_max_f32_e32 v3, v7, v3 -; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v3 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX8-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB54_1 @@ -14416,36 +14417,38 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX12-NEXT: v_dual_max_num_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_max_num_f32_e32 v3, v6, v3 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-NEXT: v_max_num_f32_e32 v5, v5, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v4 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -14462,35 +14465,35 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v6, v4 -; GFX940-NEXT: v_max_f32_e32 v3, v7, v3 -; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v3, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX940-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX940-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB55_1 @@ -14503,39 +14506,41 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: v_dual_max_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_max_f32_e32 v3, v6, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -14550,34 +14555,34 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_max_f32_e32 v3, v6, v3 -; GFX10-NEXT: v_max_f32_e32 v5, v7, v5 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX10-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB55_1 @@ -14591,33 +14596,33 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v6, v4 -; GFX90A-NEXT: v_max_f32_e32 v3, v7, v3 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX90A-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX90A-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB55_1 @@ -14631,33 +14636,33 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_max_f32_e32 v5, v6, v5 -; GFX908-NEXT: v_max_f32_e32 v3, v7, v3 -; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: v_mov_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX908-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB55_1 @@ -14673,34 +14678,34 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX8-NEXT: v_max_f32_e32 v5, v6, v5 -; GFX8-NEXT: v_max_f32_e32 v0, v7, v0 -; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB55_1 @@ -14813,36 +14818,38 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX12-NEXT: v_dual_max_num_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_max_num_f32_e32 v3, v6, v3 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-NEXT: v_max_num_f32_e32 v5, v5, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v4 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -14859,35 +14866,35 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v6, v4 -; GFX940-NEXT: v_max_f32_e32 v3, v7, v3 -; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v3, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX940-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX940-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:-2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB56_1 @@ -14900,39 +14907,41 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: v_dual_max_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_max_f32_e32 v3, v6, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -14947,34 +14956,34 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_max_f32_e32 v3, v6, v3 -; GFX10-NEXT: v_max_f32_e32 v5, v7, v5 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX10-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:-2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB56_1 @@ -14988,33 +14997,33 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v6, v4 -; GFX90A-NEXT: v_max_f32_e32 v3, v7, v3 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX90A-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX90A-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB56_1 @@ -15028,33 +15037,33 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_max_f32_e32 v5, v6, v5 -; GFX908-NEXT: v_max_f32_e32 v3, v7, v3 -; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX908-NEXT: v_mov_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX908-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB56_1 @@ -15070,34 +15079,34 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX8-NEXT: v_max_f32_e32 v5, v6, v5 -; GFX8-NEXT: v_max_f32_e32 v0, v7, v0 -; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB56_1 @@ -15213,38 +15222,38 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_max_num_f32_e32 v3, v5, v3 -; GFX12-NEXT: v_max_num_f32_e32 v5, v7, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4 +; GFX12-NEXT: v_max_num_f32_e32 v6, v6, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -15258,38 +15267,38 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX940-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v5, v[0:1], off +; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX940-NEXT: v_max_f32_e32 v3, v4, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v7, v6 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB57_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15299,41 +15308,41 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX11-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX11-NEXT: v_max_f32_e32 v5, v7, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -15346,35 +15355,35 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX10-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX10-NEXT: v_max_f32_e32 v5, v7, v6 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB57_1 @@ -15385,36 +15394,36 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off +; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_max_f32_e32 v3, v4, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v7, v6 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB57_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15424,36 +15433,36 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX908-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off +; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX908-NEXT: v_max_f32_e32 v5, v7, v6 -; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX908-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB57_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15463,37 +15472,37 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX8-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_max_f32_e32 v5, v7, v6 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX8-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB57_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15599,38 +15608,38 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2044 +; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_max_num_f32_e32 v3, v5, v3 -; GFX12-NEXT: v_max_num_f32_e32 v5, v7, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4 +; GFX12-NEXT: v_max_num_f32_e32 v6, v6, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -15644,38 +15653,38 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX940-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2044 +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX940-NEXT: v_max_f32_e32 v3, v4, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v7, v6 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB58_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15685,41 +15694,41 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX11-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044 +; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX11-NEXT: v_max_f32_e32 v5, v7, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_max_f32_e32 v6, v6, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -15732,35 +15741,35 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX10-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX10-NEXT: v_max_f32_e32 v5, v7, v6 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB58_1 @@ -15771,36 +15780,36 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_max_f32_e32 v3, v4, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v7, v6 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB58_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15810,36 +15819,36 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX908-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX908-NEXT: v_max_f32_e32 v5, v7, v6 -; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX908-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB58_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15851,37 +15860,37 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_max_f32_e32 v5, v7, v6 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX8-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB58_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15988,38 +15997,38 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:-2048 +; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_max_num_f32_e32 v3, v5, v3 -; GFX12-NEXT: v_max_num_f32_e32 v5, v7, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4 +; GFX12-NEXT: v_max_num_f32_e32 v6, v6, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -16033,38 +16042,38 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX940-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:-2048 +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX940-NEXT: v_max_f32_e32 v3, v4, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v7, v6 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB59_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16074,41 +16083,41 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX11-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:-2048 +; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX11-NEXT: v_max_f32_e32 v5, v7, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -16121,35 +16130,35 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX10-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:-2048 +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX10-NEXT: v_max_f32_e32 v5, v7, v6 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB59_1 @@ -16160,36 +16169,36 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:-2048 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_max_f32_e32 v3, v4, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v7, v6 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB59_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16199,36 +16208,36 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX908-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:-2048 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX908-NEXT: v_max_f32_e32 v5, v7, v6 -; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX908-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB59_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16240,37 +16249,37 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_max_f32_e32 v5, v7, v6 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX8-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB59_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16386,37 +16395,39 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX12-NEXT: v_dual_max_num_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_max_num_f32_e32 v3, v6, v3 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-NEXT: v_max_num_f32_e32 v5, v5, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v4 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -16433,35 +16444,35 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v6, v4 -; GFX940-NEXT: v_max_f32_e32 v3, v7, v3 -; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v3, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX940-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX940-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB60_1 @@ -16474,39 +16485,41 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: v_dual_max_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_max_f32_e32 v3, v6, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -16521,34 +16534,34 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_max_f32_e32 v3, v6, v3 -; GFX10-NEXT: v_max_f32_e32 v5, v7, v5 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX10-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB60_1 @@ -16562,35 +16575,35 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v6, v4 -; GFX90A-NEXT: v_max_f32_e32 v3, v7, v3 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX90A-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX90A-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB60_1 @@ -16604,33 +16617,33 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_max_f32_e32 v5, v6, v5 -; GFX908-NEXT: v_max_f32_e32 v3, v7, v3 -; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: v_mov_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX908-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB60_1 @@ -16646,34 +16659,34 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX8-NEXT: v_max_f32_e32 v5, v6, v5 -; GFX8-NEXT: v_max_f32_e32 v0, v7, v0 -; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB60_1 @@ -16785,39 +16798,39 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2044 +; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_max_num_f32_e32 v3, v5, v3 -; GFX12-NEXT: v_max_num_f32_e32 v5, v7, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4 +; GFX12-NEXT: v_max_num_f32_e32 v6, v6, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -16831,38 +16844,38 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX940-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2044 +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX940-NEXT: v_max_f32_e32 v3, v4, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v7, v6 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB61_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16872,41 +16885,41 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX11-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044 +; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX11-NEXT: v_max_f32_e32 v5, v7, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -16919,35 +16932,35 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX10-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX10-NEXT: v_max_f32_e32 v5, v7, v6 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB61_1 @@ -16958,38 +16971,38 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX90A-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_max_f32_e32 v3, v4, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v7, v6 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB61_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16999,36 +17012,36 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX908-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX908-NEXT: v_max_f32_e32 v5, v7, v6 -; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX908-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB61_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -17040,37 +17053,37 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_max_f32_e32 v5, v7, v6 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX8-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB61_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll index ecea5fdecfcd0..903e80b15814f 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll @@ -32,13 +32,13 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v3, v4 +; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -77,13 +77,13 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4 +; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -101,13 +101,13 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v5 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -193,13 +193,13 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v3, v4 +; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -238,13 +238,13 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4 +; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -262,13 +262,13 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v5 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -356,13 +356,13 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v3, v4 +; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -401,13 +401,13 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4 +; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -425,13 +425,13 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v5 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -517,21 +517,21 @@ define void @global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v5, v[0:1], off +; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v4, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB3_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -561,20 +561,20 @@ define void @global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off +; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v4, v3 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB3_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -584,20 +584,20 @@ define void @global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off +; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB3_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -672,21 +672,21 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2044 +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v4, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB4_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -716,20 +716,20 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v4, v3 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB4_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -739,20 +739,20 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB4_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -830,21 +830,21 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:-2048 +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v4, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB5_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -874,20 +874,20 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:-2048 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v4, v3 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB5_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -897,20 +897,20 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:-2048 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB5_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -991,13 +991,13 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v3, v4 +; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -1036,13 +1036,13 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4 +; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -1062,13 +1062,13 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v5 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1155,21 +1155,21 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX940-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2044 +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v4, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB7_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1199,22 +1199,22 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX90A-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v4, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB7_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1224,20 +1224,20 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX908-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB7_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1317,13 +1317,13 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v3, v4 +; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -1341,14 +1341,15 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_max_f32 v3, v2, v2 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f32_e32 v5, v4, v4 -; GFX11-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX11-NEXT: v_max_f32_e32 v3, v4, v4 +; GFX11-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1368,14 +1369,14 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_max_f32_e32 v3, v2, v2 -; GFX10-NEXT: v_max_f32_e32 v5, v4, v4 -; GFX10-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX10-NEXT: v_max_f32_e32 v3, v4, v4 +; GFX10-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -1395,13 +1396,13 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4 +; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -1419,13 +1420,13 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v5 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1546,13 +1547,13 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__a ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v3, v4 +; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -1591,13 +1592,13 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__a ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4 +; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -1615,13 +1616,13 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__a ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v5 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1711,13 +1712,13 @@ define float @global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v3, v4 +; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -1756,13 +1757,13 @@ define float @global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4 +; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -1780,13 +1781,13 @@ define float @global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v5 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1872,13 +1873,13 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v3, v4 +; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -1917,13 +1918,13 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4 +; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -1941,13 +1942,13 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v5 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -2035,13 +2036,13 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fi ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v3, v4 +; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -2080,13 +2081,13 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fi ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4 +; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -2104,13 +2105,13 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fi ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v5 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -2196,21 +2197,21 @@ define void @global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v5, v[0:1], off +; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v4, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB13_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2240,20 +2241,20 @@ define void @global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off +; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v4, v3 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2263,20 +2264,20 @@ define void @global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off +; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB13_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2351,21 +2352,21 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2044 +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v4, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB14_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2395,20 +2396,20 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v4, v3 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2418,20 +2419,20 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB14_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2509,21 +2510,21 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_f ; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:-2048 +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v4, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB15_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2553,20 +2554,20 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_f ; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:-2048 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v4, v3 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB15_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2576,20 +2577,20 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_f ; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:-2048 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB15_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2670,13 +2671,13 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v3, v4 +; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -2715,13 +2716,13 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4 +; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -2741,13 +2742,13 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v5 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -2834,21 +2835,21 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_ ; GFX940-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2044 +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v4, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB17_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2878,22 +2879,22 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_ ; GFX90A-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v4, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2903,20 +2904,20 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_ ; GFX908-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB17_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2990,15 +2991,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] -; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[8:9], v[4:5] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -3028,15 +3029,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX11-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -3074,15 +3075,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] ; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9] +; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] ; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -3100,15 +3101,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: v_mov_b32_e32 v6, v4 -; GFX8-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] ; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9] +; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -3163,15 +3164,15 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] -; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[8:9], v[4:5] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -3201,15 +3202,15 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX11-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -3247,15 +3248,15 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040 +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] ; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9] +; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] ; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -3275,15 +3276,15 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7f8, v0 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v1 ; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX8-NEXT: v_min_f64 v[6:7], v[0:1], v[6:7] +; GFX8-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -3337,15 +3338,15 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] -; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[8:9], v[4:5] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -3375,15 +3376,15 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX11-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -3421,15 +3422,15 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048 +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] ; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9] +; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] ; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -3449,15 +3450,15 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v1 ; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX8-NEXT: v_min_f64 v[6:7], v[0:1], v[6:7] +; GFX8-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -3510,21 +3511,21 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off +; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[8:9], v[4:5] +; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -3547,22 +3548,22 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX11-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[6:7], v[0:1], off +; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] +; GFX11-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -3592,21 +3593,21 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX908-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[6:7], v[0:1], off +; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX908-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc +; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX908-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX908-NEXT: v_mov_b32_e32 v7, v5 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v5, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_mov_b32_e32 v4, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB21_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3616,21 +3617,21 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX8-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX8-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX8-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v5, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB21_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3673,21 +3674,21 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off offset:2040 +; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[8:9], v[4:5] +; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -3710,22 +3711,22 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX11-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[6:7], v[0:1], off offset:2040 +; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 +; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] +; GFX11-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:2040 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -3755,21 +3756,21 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX908-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:2040 +; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040 +; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX908-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 glc +; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX908-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:2040 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX908-NEXT: v_mov_b32_e32 v7, v5 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v5, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_mov_b32_e32 v4, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB22_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3781,21 +3782,21 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7f8, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX8-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX8-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v5, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB22_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3839,21 +3840,21 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off offset:-2048 +; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[8:9], v[4:5] +; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -3876,22 +3877,22 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX11-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[6:7], v[0:1], off offset:-2048 +; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 +; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] +; GFX11-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:-2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -3921,21 +3922,21 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX908-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:-2048 +; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048 +; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX908-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 glc +; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX908-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX908-NEXT: v_mov_b32_e32 v7, v5 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v5, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_mov_b32_e32 v4, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB23_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3947,21 +3948,21 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX8-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX8-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v5, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB23_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4006,15 +4007,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] -; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[8:9], v[4:5] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -4044,15 +4045,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX11-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -4072,15 +4073,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v7, v5 ; GFX10-NEXT: v_mov_b32_e32 v6, v4 -; GFX10-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX10-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX10-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] +; GFX10-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX10-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -4101,13 +4102,13 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] ; GFX90A-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX90A-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9] +; GFX90A-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] ; GFX90A-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -4125,15 +4126,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] ; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9] +; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] ; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -4151,15 +4152,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: v_mov_b32_e32 v6, v4 -; GFX8-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] ; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9] +; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -4177,28 +4178,26 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_mov_b32_e32 v7, v1 -; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: v_mov_b32_e32 v5, v1 +; GFX7-NEXT: v_mov_b32_e32 v4, v0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 +; GFX7-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v11, v1 ; GFX7-NEXT: v_mov_b32_e32 v10, v0 -; GFX7-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] ; GFX7-NEXT: v_max_f64 v[0:1], v[10:11], v[10:11] -; GFX7-NEXT: v_min_f64 v[8:9], v[0:1], v[2:3] +; GFX7-NEXT: v_min_f64 v[8:9], v[0:1], v[6:7] ; GFX7-NEXT: v_mov_b32_e32 v0, v8 ; GFX7-NEXT: v_mov_b32_e32 v1, v9 ; GFX7-NEXT: v_mov_b32_e32 v2, v10 ; GFX7-NEXT: v_mov_b32_e32 v3, v11 -; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] @@ -4213,14 +4212,13 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: v_mov_b32_e32 v7, v1 -; GFX6-NEXT: v_mov_b32_e32 v6, v0 +; GFX6-NEXT: v_mov_b32_e32 v5, v1 +; GFX6-NEXT: v_mov_b32_e32 v4, v0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 +; GFX6-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -4228,14 +4226,13 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX6-NEXT: v_mov_b32_e32 v11, v1 ; GFX6-NEXT: v_mov_b32_e32 v10, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] ; GFX6-NEXT: v_max_f64 v[0:1], v[10:11], v[10:11] -; GFX6-NEXT: v_min_f64 v[8:9], v[0:1], v[2:3] +; GFX6-NEXT: v_min_f64 v[8:9], v[0:1], v[6:7] ; GFX6-NEXT: v_mov_b32_e32 v0, v8 ; GFX6-NEXT: v_mov_b32_e32 v1, v9 ; GFX6-NEXT: v_mov_b32_e32 v2, v10 ; GFX6-NEXT: v_mov_b32_e32 v3, v11 -; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc +; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] @@ -4259,15 +4256,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] -; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[8:9], v[4:5] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -4297,15 +4294,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] -; GFX11-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -4343,15 +4340,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__ ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] ; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9] +; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] ; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -4369,15 +4366,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: v_mov_b32_e32 v6, v4 -; GFX8-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] ; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9] +; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -4436,8 +4433,9 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: global_load_b32 v5, v[0:1], off @@ -4449,12 +4447,11 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v7 +; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v2 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -4487,14 +4484,14 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX940-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 -; GFX940-NEXT: v_lshrrev_b32_e32 v6, v3, v7 -; GFX940-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX940-NEXT: v_max_f16_e32 v6, v6, v6 -; GFX940-NEXT: v_min_f16_e32 v5, v6, v5 +; GFX940-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX940-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX940-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX940-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX940-NEXT: buffer_wbl2 sc1 @@ -4514,8 +4511,9 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX11-NEXT: global_load_b32 v5, v[0:1], off @@ -4527,12 +4525,11 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_min_f16_e32 v5, v5, v7 +; GFX11-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -4556,6 +4553,7 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 @@ -4567,10 +4565,9 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 -; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX10-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX10-NEXT: v_min_f16_e32 v5, v5, v7 +; GFX10-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4599,14 +4596,14 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v3, v7 -; GFX90A-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX90A-NEXT: v_max_f16_e32 v6, v6, v6 -; GFX90A-NEXT: v_min_f16_e32 v5, v6, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX90A-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc @@ -4633,14 +4630,14 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 -; GFX908-NEXT: v_lshrrev_b32_e32 v7, v3, v6 -; GFX908-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX908-NEXT: v_max_f16_e32 v7, v7, v7 -; GFX908-NEXT: v_min_f16_e32 v5, v7, v5 +; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX908-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX908-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX908-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc @@ -4667,17 +4664,17 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, v3, v6 -; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v7, v7, v7 -; GFX8-NEXT: v_min_f16_e32 v5, v7, v5 -; GFX8-NEXT: v_and_b32_e32 v8, v6, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX8-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX8-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX8-NEXT: v_or_b32_e32 v5, v8, v5 +; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 ; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -4790,10 +4787,11 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -4803,12 +4801,11 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v7 +; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v2 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -4843,14 +4840,14 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX940-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 -; GFX940-NEXT: v_lshrrev_b32_e32 v6, v3, v7 -; GFX940-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX940-NEXT: v_max_f16_e32 v6, v6, v6 -; GFX940-NEXT: v_min_f16_e32 v5, v6, v5 +; GFX940-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX940-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX940-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX940-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX940-NEXT: buffer_wbl2 sc1 @@ -4871,10 +4868,11 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: global_load_b32 v5, v[0:1], off ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -4884,12 +4882,11 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_min_f16_e32 v5, v5, v7 +; GFX11-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -4914,9 +4911,10 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: global_load_dword v5, v[0:1], off ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -4925,10 +4923,9 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 -; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX10-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX10-NEXT: v_min_f16_e32 v5, v5, v7 +; GFX10-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4958,14 +4955,14 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v3, v7 -; GFX90A-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX90A-NEXT: v_max_f16_e32 v6, v6, v6 -; GFX90A-NEXT: v_min_f16_e32 v5, v6, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX90A-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc @@ -4993,14 +4990,14 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 -; GFX908-NEXT: v_lshrrev_b32_e32 v7, v3, v6 -; GFX908-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX908-NEXT: v_max_f16_e32 v7, v7, v7 -; GFX908-NEXT: v_min_f16_e32 v5, v7, v5 +; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX908-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX908-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX908-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc @@ -5028,17 +5025,17 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, v3, v6 -; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v7, v7, v7 -; GFX8-NEXT: v_min_f16_e32 v5, v7, v5 -; GFX8-NEXT: v_and_b32_e32 v8, v6, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX8-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX8-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX8-NEXT: v_or_b32_e32 v5, v8, v5 +; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 ; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -5155,10 +5152,11 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -5168,12 +5166,11 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v7 +; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v2 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -5209,14 +5206,14 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX940-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 -; GFX940-NEXT: v_lshrrev_b32_e32 v6, v3, v7 -; GFX940-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX940-NEXT: v_max_f16_e32 v6, v6, v6 -; GFX940-NEXT: v_min_f16_e32 v5, v6, v5 +; GFX940-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX940-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX940-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX940-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX940-NEXT: buffer_wbl2 sc1 @@ -5237,10 +5234,11 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: global_load_b32 v5, v[0:1], off ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -5250,12 +5248,11 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_min_f16_e32 v5, v5, v7 +; GFX11-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -5280,9 +5277,10 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: global_load_dword v5, v[0:1], off ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -5291,10 +5289,9 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 -; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX10-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX10-NEXT: v_min_f16_e32 v5, v5, v7 +; GFX10-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -5324,14 +5321,14 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v3, v7 -; GFX90A-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX90A-NEXT: v_max_f16_e32 v6, v6, v6 -; GFX90A-NEXT: v_min_f16_e32 v5, v6, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX90A-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc @@ -5359,14 +5356,14 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 -; GFX908-NEXT: v_lshrrev_b32_e32 v7, v3, v6 -; GFX908-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX908-NEXT: v_max_f16_e32 v7, v7, v7 -; GFX908-NEXT: v_min_f16_e32 v5, v7, v5 +; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX908-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX908-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX908-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc @@ -5394,17 +5391,17 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, v3, v6 -; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v7, v7, v7 -; GFX8-NEXT: v_min_f16_e32 v5, v7, v5 -; GFX8-NEXT: v_and_b32_e32 v8, v6, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX8-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX8-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX8-NEXT: v_or_b32_e32 v5, v8, v5 +; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 ; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -5520,8 +5517,9 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: global_load_b32 v4, v[0:1], off @@ -5533,10 +5531,9 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3 -; GFX12-NEXT: v_min_num_f16_e32 v3, v3, v7 +; GFX12-NEXT: v_min_num_f16_e32 v3, v3, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 @@ -5570,13 +5567,13 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX940-NEXT: v_not_b32_e32 v6, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX940-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX940-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX940-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX940-NEXT: v_min_f16_e32 v4, v4, v7 +; GFX940-NEXT: v_min_f16_e32 v4, v4, v2 ; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX940-NEXT: buffer_wbl2 sc1 @@ -5596,8 +5593,9 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX11-NEXT: global_load_b32 v4, v[0:1], off @@ -5609,10 +5607,9 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX11-NEXT: v_min_f16_e32 v3, v3, v7 +; GFX11-NEXT: v_min_f16_e32 v3, v3, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 @@ -5637,6 +5634,7 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 @@ -5648,9 +5646,8 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX10-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX10-NEXT: v_min_f16_e32 v3, v3, v7 +; GFX10-NEXT: v_min_f16_e32 v3, v3, v2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -5679,13 +5676,13 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX90A-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX90A-NEXT: v_min_f16_e32 v4, v4, v7 +; GFX90A-NEXT: v_min_f16_e32 v4, v4, v2 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc @@ -5712,13 +5709,13 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 ; GFX908-NEXT: v_not_b32_e32 v6, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX908-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX908-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX908-NEXT: v_min_f16_e32 v3, v3, v7 +; GFX908-NEXT: v_min_f16_e32 v3, v3, v2 ; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc @@ -5745,16 +5742,16 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 ; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX8-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX8-NEXT: v_min_f16_e32 v3, v3, v7 -; GFX8-NEXT: v_and_b32_e32 v8, v4, v6 +; GFX8-NEXT: v_min_f16_e32 v3, v3, v2 +; GFX8-NEXT: v_and_b32_e32 v7, v4, v6 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX8-NEXT: v_or_b32_e32 v3, v8, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v7, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -5862,36 +5859,36 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v6, v3 +; GFX12-NEXT: v_not_b32_e32 v5, v5 ; GFX12-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3 -; GFX12-NEXT: v_min_num_f16_e32 v3, v3, v7 +; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GFX12-NEXT: v_min_num_f16_e32 v2, v2, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -5909,29 +5906,29 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: global_load_dword v5, v[0:1], off -; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX940-NEXT: global_load_dword v3, v[0:1], off +; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v6, v4 +; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0 +; GFX940-NEXT: v_not_b32_e32 v5, v5 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX940-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX940-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX940-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX940-NEXT: v_min_f16_e32 v4, v4, v7 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX940-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX940-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB30_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5941,37 +5938,37 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX11-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: global_load_b32 v4, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v6, v3 +; GFX11-NEXT: v_not_b32_e32 v5, v5 ; GFX11-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX11-NEXT: v_min_f16_e32 v3, v3, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -5983,31 +5980,31 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX10-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX10-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX10-NEXT: global_load_dword v4, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX10-NEXT: v_not_b32_e32 v6, v3 +; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX10-NEXT: v_not_b32_e32 v5, v5 ; GFX10-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX10-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX10-NEXT: v_min_f16_e32 v3, v3, v7 -; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX10-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB30_1 @@ -6018,31 +6015,31 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX90A-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: global_load_dword v5, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v6, v4 +; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX90A-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX90A-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX90A-NEXT: v_min_f16_e32 v4, v4, v7 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 -; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc +; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX90A-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB30_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6052,31 +6049,31 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX908-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 +; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: global_load_dword v4, v[0:1], off -; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX908-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 -; GFX908-NEXT: v_not_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX908-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX908-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX908-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX908-NEXT: v_min_f16_e32 v3, v3, v7 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX908-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB30_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6086,32 +6083,32 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX8-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX8-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 -; GFX8-NEXT: v_not_b32_e32 v6, v3 +; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX8-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX8-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX8-NEXT: v_min_f16_e32 v3, v3, v7 -; GFX8-NEXT: v_and_b32_e32 v8, v4, v6 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX8-NEXT: v_or_b32_e32 v3, v8, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX8-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX8-NEXT: v_and_b32_e32 v7, v3, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB30_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6216,36 +6213,36 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v6, v3 +; GFX12-NEXT: v_not_b32_e32 v5, v5 ; GFX12-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3 -; GFX12-NEXT: v_min_num_f16_e32 v3, v3, v7 +; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GFX12-NEXT: v_min_num_f16_e32 v2, v2, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -6264,29 +6261,29 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: global_load_dword v5, v[0:1], off -; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX940-NEXT: global_load_dword v3, v[0:1], off +; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v6, v4 +; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0 +; GFX940-NEXT: v_not_b32_e32 v5, v5 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX940-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX940-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX940-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX940-NEXT: v_min_f16_e32 v4, v4, v7 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX940-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX940-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB31_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6296,37 +6293,37 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX11-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX11-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: global_load_b32 v4, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v6, v3 +; GFX11-NEXT: v_not_b32_e32 v5, v5 ; GFX11-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX11-NEXT: v_min_f16_e32 v3, v3, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -6338,31 +6335,31 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX10-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX10-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX10-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX10-NEXT: global_load_dword v4, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX10-NEXT: v_not_b32_e32 v6, v3 +; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX10-NEXT: v_not_b32_e32 v5, v5 ; GFX10-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX10-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX10-NEXT: v_min_f16_e32 v3, v3, v7 -; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX10-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB31_1 @@ -6373,31 +6370,31 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX90A-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: global_load_dword v5, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v6, v4 +; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX90A-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX90A-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX90A-NEXT: v_min_f16_e32 v4, v4, v7 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 -; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc +; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX90A-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB31_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6407,31 +6404,31 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX908-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 +; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: global_load_dword v4, v[0:1], off -; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX908-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 -; GFX908-NEXT: v_not_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX908-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX908-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX908-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX908-NEXT: v_min_f16_e32 v3, v3, v7 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX908-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB31_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6441,32 +6438,32 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX8-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX8-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 -; GFX8-NEXT: v_not_b32_e32 v6, v3 +; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX8-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX8-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX8-NEXT: v_min_f16_e32 v3, v3, v7 -; GFX8-NEXT: v_and_b32_e32 v8, v4, v6 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX8-NEXT: v_or_b32_e32 v3, v8, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX8-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX8-NEXT: v_and_b32_e32 v7, v3, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB31_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6572,15 +6569,15 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: v_max_num_f16_e32 v3, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v5, v4, v4 -; GFX12-NEXT: v_min_num_f16_e32 v3, v5, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v3, v4, v4 +; GFX12-NEXT: v_min_num_f16_e32 v3, v3, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 @@ -6605,14 +6602,14 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX940-NEXT: s_mov_b32 s2, 0xffff0000 ; GFX940-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f16_e32 v3, v5, v5 -; GFX940-NEXT: v_min_f16_e32 v3, v3, v4 +; GFX940-NEXT: v_min_f16_e32 v3, v3, v2 ; GFX940-NEXT: v_and_or_b32 v4, v5, s2, v3 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 sc0 @@ -6631,15 +6628,15 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: v_max_f16_e32 v3, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v5, v4, v4 -; GFX11-NEXT: v_min_f16_e32 v3, v5, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v3, v4, v4 +; GFX11-NEXT: v_min_f16_e32 v3, v3, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 @@ -6662,14 +6659,14 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2046 +; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_max_f16_e32 v3, v2, v2 -; GFX10-NEXT: v_max_f16_e32 v5, v4, v4 -; GFX10-NEXT: v_min_f16_e32 v3, v5, v3 +; GFX10-NEXT: v_max_f16_e32 v3, v4, v4 +; GFX10-NEXT: v_min_f16_e32 v3, v3, v2 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX10-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6691,14 +6688,14 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 ; GFX90A-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f16_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f16_e32 v3, v3, v4 +; GFX90A-NEXT: v_min_f16_e32 v3, v3, v2 ; GFX90A-NEXT: v_and_or_b32 v4, v5, s6, v3 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -6717,14 +6714,14 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 ; GFX908-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f16_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f16_e32 v3, v3, v5 +; GFX908-NEXT: v_min_f16_e32 v3, v3, v2 ; GFX908-NEXT: v_and_or_b32 v3, v4, s6, v3 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2046 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -6745,19 +6742,19 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_e32 v1, v2, v2 ; GFX8-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v0, v1, v1 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 -; GFX8-NEXT: v_min_f16_e32 v0, v0, v5 -; GFX8-NEXT: v_or_b32_e32 v0, v6, v0 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_max_f16_e32 v0, v6, v6 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX8-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX8-NEXT: v_or_b32_e32 v5, v2, v0 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB32_1 @@ -6847,24 +6844,24 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2046 +; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; GFX12-NEXT: v_max_num_f16_e32 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_max_num_f16_e32 v3, v2, v2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f16_e32 v5, v4, v4 +; GFX12-NEXT: v_max_num_f16_e32 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f16_e32 v3, v5, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-NEXT: v_min_num_f16_e32 v2, v2, v4 +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -6878,23 +6875,23 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX940-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2046 +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX940-NEXT: s_mov_b32 s2, 0xffff0000 ; GFX940-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_max_f16_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f16_e32 v4, v5, v5 -; GFX940-NEXT: v_min_f16_e32 v3, v4, v3 -; GFX940-NEXT: v_and_or_b32 v4, v5, s2, v3 +; GFX940-NEXT: v_max_f16_e32 v2, v3, v3 +; GFX940-NEXT: v_min_f16_e32 v2, v2, v4 +; GFX940-NEXT: v_and_or_b32 v2, v3, s2, v2 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 sc0 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB33_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6904,25 +6901,25 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX11-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2046 +; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; GFX11-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_max_f16_e32 v3, v2, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f16_e32 v5, v4, v4 +; GFX11-NEXT: v_max_f16_e32 v2, v3, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_min_f16_e32 v3, v5, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: v_min_f16_e32 v2, v2, v4 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX11-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -6934,24 +6931,24 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX10-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2046 +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2046 +; GFX10-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_max_f16_e32 v3, v2, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_max_f16_e32 v5, v4, v4 -; GFX10-NEXT: v_min_f16_e32 v3, v5, v3 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX10-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX10-NEXT: v_max_f16_e32 v2, v3, v3 +; GFX10-NEXT: v_min_f16_e32 v2, v2, v4 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2046 glc +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB33_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6961,22 +6958,22 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX90A-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2046 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 ; GFX90A-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_max_f16_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f16_e32 v4, v5, v5 -; GFX90A-NEXT: v_min_f16_e32 v3, v4, v3 -; GFX90A-NEXT: v_and_or_b32 v4, v5, s6, v3 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 glc +; GFX90A-NEXT: v_max_f16_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f16_e32 v2, v2, v4 +; GFX90A-NEXT: v_and_or_b32 v2, v3, s6, v2 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB33_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6986,22 +6983,22 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX908-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2046 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 ; GFX908-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_max_f16_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f16_e32 v5, v4, v4 -; GFX908-NEXT: v_min_f16_e32 v3, v5, v3 -; GFX908-NEXT: v_and_or_b32 v3, v4, s6, v3 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2046 glc +; GFX908-NEXT: v_max_f16_e32 v2, v3, v3 +; GFX908-NEXT: v_min_f16_e32 v2, v2, v4 +; GFX908-NEXT: v_and_or_b32 v2, v3, s6, v2 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB33_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7013,22 +7010,22 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fe, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX8-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_max_f16_e32 v3, v2, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f16_e32 v5, v4, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX8-NEXT: v_min_f16_e32 v3, v5, v3 -; GFX8-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_max_f16_e32 v2, v3, v3 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX8-NEXT: v_min_f16_e32 v2, v2, v4 +; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB33_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7117,10 +7114,11 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -7130,12 +7128,11 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v7 +; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v2 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -7171,14 +7168,14 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX940-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 -; GFX940-NEXT: v_lshrrev_b32_e32 v6, v3, v7 -; GFX940-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX940-NEXT: v_max_f16_e32 v6, v6, v6 -; GFX940-NEXT: v_min_f16_e32 v5, v6, v5 +; GFX940-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX940-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX940-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX940-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 @@ -7199,10 +7196,11 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: global_load_b32 v5, v[0:1], off ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -7212,12 +7210,11 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_min_f16_e32 v5, v5, v7 +; GFX11-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -7242,9 +7239,10 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: global_load_dword v5, v[0:1], off ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -7253,10 +7251,9 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 -; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX10-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX10-NEXT: v_min_f16_e32 v5, v5, v7 +; GFX10-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -7286,14 +7283,14 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v3, v7 -; GFX90A-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX90A-NEXT: v_max_f16_e32 v6, v6, v6 -; GFX90A-NEXT: v_min_f16_e32 v5, v6, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX90A-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX90A-NEXT: buffer_wbl2 @@ -7323,14 +7320,14 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 -; GFX908-NEXT: v_lshrrev_b32_e32 v7, v3, v6 -; GFX908-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX908-NEXT: v_max_f16_e32 v7, v7, v7 -; GFX908-NEXT: v_min_f16_e32 v5, v7, v5 +; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX908-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX908-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX908-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc @@ -7358,17 +7355,17 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, v3, v6 -; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v7, v7, v7 -; GFX8-NEXT: v_min_f16_e32 v5, v7, v5 -; GFX8-NEXT: v_and_b32_e32 v8, v6, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX8-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX8-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX8-NEXT: v_or_b32_e32 v5, v8, v5 +; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 ; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -7483,37 +7480,37 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v6, v3 +; GFX12-NEXT: v_not_b32_e32 v5, v5 ; GFX12-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3 -; GFX12-NEXT: v_min_num_f16_e32 v3, v3, v7 +; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GFX12-NEXT: v_min_num_f16_e32 v2, v2, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -7531,29 +7528,29 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: global_load_dword v5, v[0:1], off -; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX940-NEXT: global_load_dword v3, v[0:1], off +; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v6, v4 +; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0 +; GFX940-NEXT: v_not_b32_e32 v5, v5 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX940-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX940-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX940-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX940-NEXT: v_min_f16_e32 v4, v4, v7 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX940-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX940-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 sc1 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB35_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7563,37 +7560,37 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX11-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: global_load_b32 v4, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v6, v3 +; GFX11-NEXT: v_not_b32_e32 v5, v5 ; GFX11-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX11-NEXT: v_min_f16_e32 v3, v3, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -7605,31 +7602,31 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX10-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX10-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX10-NEXT: global_load_dword v4, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX10-NEXT: v_not_b32_e32 v6, v3 +; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX10-NEXT: v_not_b32_e32 v5, v5 ; GFX10-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX10-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX10-NEXT: v_min_f16_e32 v3, v3, v7 -; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX10-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB35_1 @@ -7640,33 +7637,33 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX90A-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: global_load_dword v5, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v6, v4 +; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX90A-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX90A-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX90A-NEXT: v_min_f16_e32 v4, v4, v7 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX90A-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB35_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7676,31 +7673,31 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX908-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 +; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: global_load_dword v4, v[0:1], off -; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX908-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 -; GFX908-NEXT: v_not_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX908-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX908-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX908-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX908-NEXT: v_min_f16_e32 v3, v3, v7 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX908-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB35_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7710,32 +7707,32 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX8-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX8-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 -; GFX8-NEXT: v_not_b32_e32 v6, v3 +; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX8-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX8-NEXT: v_max_f16_e32 v7, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX8-NEXT: v_min_f16_e32 v3, v3, v7 -; GFX8-NEXT: v_and_b32_e32 v8, v4, v6 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX8-NEXT: v_or_b32_e32 v3, v8, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX8-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX8-NEXT: v_and_b32_e32 v7, v3, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB35_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -11684,15 +11681,15 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 -; GFX12-NEXT: v_pk_min_num_f16 v3, v5, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 +; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -11714,14 +11711,14 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX940-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: v_pk_max_f16 v3, v5, v5 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_min_f16 v4, v3, v4 +; GFX940-NEXT: v_pk_min_f16 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -11739,15 +11736,15 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX11-NEXT: v_pk_min_f16 v3, v5, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v3, v4, v4 +; GFX11-NEXT: v_pk_min_f16 v3, v3, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -11767,14 +11764,14 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX10-NEXT: v_pk_min_f16 v3, v5, v3 +; GFX10-NEXT: v_pk_max_f16 v3, v4, v4 +; GFX10-NEXT: v_pk_min_f16 v3, v3, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -11794,13 +11791,13 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX90A-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5 -; GFX90A-NEXT: v_pk_min_f16 v4, v3, v4 +; GFX90A-NEXT: v_pk_min_f16 v4, v3, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -11818,13 +11815,13 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX908-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_pk_max_f16 v5, v2, v2 ; GFX908-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX908-NEXT: v_pk_min_f16 v3, v3, v5 +; GFX908-NEXT: v_pk_min_f16 v3, v3, v2 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -11842,21 +11839,21 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v3, v2, v2 -; GFX8-NEXT: v_max_f16_sdwa v6, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v7, v4, v4 -; GFX8-NEXT: v_min_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v3, v7, v3 -; GFX8-NEXT: v_or_b32_e32 v3, v3, v5 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_mov_b32_e32 v6, v3 +; GFX8-NEXT: v_max_f16_sdwa v3, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 +; GFX8-NEXT: v_min_f16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX8-NEXT: v_or_b32_e32 v5, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB46_1 @@ -11977,15 +11974,15 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 -; GFX12-NEXT: v_pk_min_num_f16 v3, v5, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 +; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -12007,14 +12004,14 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX940-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: v_pk_max_f16 v3, v5, v5 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_min_f16 v4, v3, v4 +; GFX940-NEXT: v_pk_min_f16 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -12032,15 +12029,15 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX11-NEXT: v_pk_min_f16 v3, v5, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v3, v4, v4 +; GFX11-NEXT: v_pk_min_f16 v3, v3, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -12060,14 +12057,14 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX10-NEXT: v_pk_min_f16 v3, v5, v3 +; GFX10-NEXT: v_pk_max_f16 v3, v4, v4 +; GFX10-NEXT: v_pk_min_f16 v3, v3, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -12087,13 +12084,13 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX90A-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5 -; GFX90A-NEXT: v_pk_min_f16 v4, v3, v4 +; GFX90A-NEXT: v_pk_min_f16 v4, v3, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -12111,13 +12108,13 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX908-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_pk_max_f16 v5, v2, v2 ; GFX908-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX908-NEXT: v_pk_min_f16 v3, v3, v5 +; GFX908-NEXT: v_pk_min_f16 v3, v3, v2 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -12137,21 +12134,21 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v0, v2, v2 -; GFX8-NEXT: v_max_f16_sdwa v6, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v7, v1, v1 -; GFX8-NEXT: v_min_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v0, v7, v0 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 +; GFX8-NEXT: v_min_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX8-NEXT: v_or_b32_e32 v5, v5, v0 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB47_1 @@ -12272,15 +12269,15 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 +; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 -; GFX12-NEXT: v_pk_min_num_f16 v3, v5, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 +; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -12302,14 +12299,14 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX940-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: v_pk_max_f16 v3, v5, v5 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_min_f16 v4, v3, v4 +; GFX940-NEXT: v_pk_min_f16 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -12327,15 +12324,15 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 +; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX11-NEXT: v_pk_min_f16 v3, v5, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v3, v4, v4 +; GFX11-NEXT: v_pk_min_f16 v3, v3, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -12355,14 +12352,14 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX10-NEXT: v_pk_min_f16 v3, v5, v3 +; GFX10-NEXT: v_pk_max_f16 v3, v4, v4 +; GFX10-NEXT: v_pk_min_f16 v3, v3, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -12382,13 +12379,13 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX90A-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5 -; GFX90A-NEXT: v_pk_min_f16 v4, v3, v4 +; GFX90A-NEXT: v_pk_min_f16 v4, v3, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -12406,13 +12403,13 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX908-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_pk_max_f16 v5, v2, v2 ; GFX908-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX908-NEXT: v_pk_min_f16 v3, v3, v5 +; GFX908-NEXT: v_pk_min_f16 v3, v3, v2 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -12432,21 +12429,21 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v0, v2, v2 -; GFX8-NEXT: v_max_f16_sdwa v6, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v7, v1, v1 -; GFX8-NEXT: v_min_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v0, v7, v0 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 +; GFX8-NEXT: v_min_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX8-NEXT: v_or_b32_e32 v5, v5, v0 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB48_1 @@ -12570,21 +12567,21 @@ define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 +; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_min_num_f16 v3, v5, v3 +; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v4 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -12598,22 +12595,22 @@ define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX940-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v5, v[0:1], off +; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v4, v5, v5 +; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_min_f16 v4, v4, v3 +; GFX940-NEXT: v_pk_min_f16 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB49_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -12623,22 +12620,22 @@ define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX11-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_min_f16 v3, v5, v3 +; GFX11-NEXT: v_pk_min_f16 v2, v2, v4 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -12650,21 +12647,21 @@ define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX10-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX10-NEXT: v_pk_min_f16 v3, v5, v3 +; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX10-NEXT: v_pk_min_f16 v2, v2, v4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB49_1 @@ -12675,20 +12672,20 @@ define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off +; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v4, v5, v5 -; GFX90A-NEXT: v_pk_min_f16 v4, v4, v3 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc +; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX90A-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB49_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -12698,20 +12695,20 @@ define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX908-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off +; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX908-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX908-NEXT: v_pk_min_f16 v3, v5, v3 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX908-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB49_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -12721,24 +12718,24 @@ define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX8-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX8-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v7, v4, v4 -; GFX8-NEXT: v_min_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v5, v7, v6 -; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v6, v3, v3 +; GFX8-NEXT: v_min_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v6, v6, v5 +; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB49_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -12852,21 +12849,21 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2044 +; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 +; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_min_num_f16 v3, v5, v3 +; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v4 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -12880,22 +12877,22 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX940-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2044 +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v4, v5, v5 +; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_min_f16 v4, v4, v3 +; GFX940-NEXT: v_pk_min_f16 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB50_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -12905,22 +12902,22 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX11-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044 +; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_min_f16 v3, v5, v3 +; GFX11-NEXT: v_pk_min_f16 v2, v2, v4 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -12932,21 +12929,21 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX10-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX10-NEXT: v_pk_min_f16 v3, v5, v3 +; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX10-NEXT: v_pk_min_f16 v2, v2, v4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB50_1 @@ -12957,20 +12954,20 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v4, v5, v5 -; GFX90A-NEXT: v_pk_min_f16 v4, v4, v3 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX90A-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB50_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -12980,20 +12977,20 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX908-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX908-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX908-NEXT: v_pk_min_f16 v3, v5, v3 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX908-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB50_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13005,24 +13002,24 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX8-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v7, v4, v4 -; GFX8-NEXT: v_min_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v5, v7, v6 -; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v6, v3, v3 +; GFX8-NEXT: v_min_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v6, v6, v5 +; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB50_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13137,21 +13134,21 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:-2048 +; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 +; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 +; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_min_num_f16 v3, v5, v3 +; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v4 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -13165,22 +13162,22 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX940-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:-2048 +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v4, v5, v5 +; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_min_f16 v4, v4, v3 +; GFX940-NEXT: v_pk_min_f16 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB51_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13190,22 +13187,22 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX11-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:-2048 +; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 +; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_min_f16 v3, v5, v3 +; GFX11-NEXT: v_pk_min_f16 v2, v2, v4 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -13217,21 +13214,21 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX10-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:-2048 +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX10-NEXT: v_pk_min_f16 v3, v5, v3 +; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX10-NEXT: v_pk_min_f16 v2, v2, v4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB51_1 @@ -13242,20 +13239,20 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:-2048 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v4, v5, v5 -; GFX90A-NEXT: v_pk_min_f16 v4, v4, v3 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc +; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX90A-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB51_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13265,20 +13262,20 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX908-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:-2048 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX908-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX908-NEXT: v_pk_min_f16 v3, v5, v3 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX908-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB51_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13290,24 +13287,24 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX8-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v7, v4, v4 -; GFX8-NEXT: v_min_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v5, v7, v6 -; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v6, v3, v3 +; GFX8-NEXT: v_min_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v6, v6, v5 +; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB51_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13431,15 +13428,15 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 -; GFX12-NEXT: v_pk_min_num_f16 v3, v5, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 +; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v2 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS @@ -13462,14 +13459,14 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX940-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: v_pk_max_f16 v3, v5, v5 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_min_f16 v4, v3, v4 +; GFX940-NEXT: v_pk_min_f16 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -13487,15 +13484,15 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX11-NEXT: v_pk_min_f16 v3, v5, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v3, v4, v4 +; GFX11-NEXT: v_pk_min_f16 v3, v3, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -13515,14 +13512,14 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX10-NEXT: v_pk_min_f16 v3, v5, v3 +; GFX10-NEXT: v_pk_max_f16 v3, v4, v4 +; GFX10-NEXT: v_pk_min_f16 v3, v3, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -13542,13 +13539,13 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5 -; GFX90A-NEXT: v_pk_min_f16 v4, v3, v4 +; GFX90A-NEXT: v_pk_min_f16 v4, v3, v2 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -13568,13 +13565,13 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX908-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_pk_max_f16 v5, v2, v2 ; GFX908-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX908-NEXT: v_pk_min_f16 v3, v3, v5 +; GFX908-NEXT: v_pk_min_f16 v3, v3, v2 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -13594,21 +13591,21 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v0, v2, v2 -; GFX8-NEXT: v_max_f16_sdwa v6, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v7, v1, v1 -; GFX8-NEXT: v_min_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v0, v7, v0 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 +; GFX8-NEXT: v_min_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX8-NEXT: v_or_b32_e32 v5, v5, v0 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB52_1 @@ -13728,22 +13725,22 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2044 +; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 +; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_min_num_f16 v3, v5, v3 +; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v4 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -13757,22 +13754,22 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX940-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2044 +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v4, v5, v5 +; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_min_f16 v4, v4, v3 +; GFX940-NEXT: v_pk_min_f16 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB53_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13782,22 +13779,22 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX11-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044 +; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_min_f16 v3, v5, v3 +; GFX11-NEXT: v_pk_min_f16 v2, v2, v4 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -13809,21 +13806,21 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX10-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX10-NEXT: v_pk_min_f16 v3, v5, v3 +; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX10-NEXT: v_pk_min_f16 v2, v2, v4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB53_1 @@ -13834,22 +13831,22 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX90A-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v4, v5, v5 -; GFX90A-NEXT: v_pk_min_f16 v4, v4, v3 +; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX90A-NEXT: v_pk_min_f16 v2, v2, v4 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB53_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13859,20 +13856,20 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX908-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX908-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v5, v4, v4 -; GFX908-NEXT: v_pk_min_f16 v3, v5, v3 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX908-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB53_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13884,24 +13881,24 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX8-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v7, v4, v4 -; GFX8-NEXT: v_min_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v5, v7, v6 -; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v6, v3, v3 +; GFX8-NEXT: v_min_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v6, v6, v5 +; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB53_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14021,36 +14018,38 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX12-NEXT: v_dual_min_num_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_min_num_f32_e32 v3, v6, v3 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-NEXT: v_min_num_f32_e32 v5, v5, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v4 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -14067,35 +14066,35 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v6, v4 -; GFX940-NEXT: v_min_f32_e32 v3, v7, v3 -; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v3, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX940-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX940-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB54_1 @@ -14108,39 +14107,41 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: v_dual_min_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_min_f32_e32 v3, v6, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -14155,34 +14156,34 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_min_f32_e32 v3, v6, v3 -; GFX10-NEXT: v_min_f32_e32 v5, v7, v5 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX10-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB54_1 @@ -14196,33 +14197,33 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v6, v4 -; GFX90A-NEXT: v_min_f32_e32 v3, v7, v3 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX90A-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX90A-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB54_1 @@ -14236,33 +14237,33 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_min_f32_e32 v5, v6, v5 -; GFX908-NEXT: v_min_f32_e32 v3, v7, v3 -; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: v_mov_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX908-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB54_1 @@ -14276,34 +14277,34 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_min_f32_e32 v5, v6, v5 -; GFX8-NEXT: v_min_f32_e32 v3, v7, v3 -; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v3 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX8-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB54_1 @@ -14416,36 +14417,38 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX12-NEXT: v_dual_min_num_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_min_num_f32_e32 v3, v6, v3 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-NEXT: v_min_num_f32_e32 v5, v5, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v4 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -14462,35 +14465,35 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v6, v4 -; GFX940-NEXT: v_min_f32_e32 v3, v7, v3 -; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v3, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX940-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX940-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB55_1 @@ -14503,39 +14506,41 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: v_dual_min_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_min_f32_e32 v3, v6, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -14550,34 +14555,34 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_min_f32_e32 v3, v6, v3 -; GFX10-NEXT: v_min_f32_e32 v5, v7, v5 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX10-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB55_1 @@ -14591,33 +14596,33 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v6, v4 -; GFX90A-NEXT: v_min_f32_e32 v3, v7, v3 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX90A-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX90A-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB55_1 @@ -14631,33 +14636,33 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_min_f32_e32 v5, v6, v5 -; GFX908-NEXT: v_min_f32_e32 v3, v7, v3 -; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: v_mov_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX908-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB55_1 @@ -14673,34 +14678,34 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX8-NEXT: v_min_f32_e32 v5, v6, v5 -; GFX8-NEXT: v_min_f32_e32 v0, v7, v0 -; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB55_1 @@ -14813,36 +14818,38 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX12-NEXT: v_dual_min_num_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_min_num_f32_e32 v3, v6, v3 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-NEXT: v_min_num_f32_e32 v5, v5, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v4 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -14859,35 +14866,35 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v6, v4 -; GFX940-NEXT: v_min_f32_e32 v3, v7, v3 -; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v3, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX940-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX940-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:-2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB56_1 @@ -14900,39 +14907,41 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: v_dual_min_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_min_f32_e32 v3, v6, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -14947,34 +14956,34 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_min_f32_e32 v3, v6, v3 -; GFX10-NEXT: v_min_f32_e32 v5, v7, v5 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX10-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:-2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB56_1 @@ -14988,33 +14997,33 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v6, v4 -; GFX90A-NEXT: v_min_f32_e32 v3, v7, v3 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX90A-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX90A-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB56_1 @@ -15028,33 +15037,33 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_min_f32_e32 v5, v6, v5 -; GFX908-NEXT: v_min_f32_e32 v3, v7, v3 -; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX908-NEXT: v_mov_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX908-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB56_1 @@ -15070,34 +15079,34 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX8-NEXT: v_min_f32_e32 v5, v6, v5 -; GFX8-NEXT: v_min_f32_e32 v0, v7, v0 -; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB56_1 @@ -15213,38 +15222,38 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_min_num_f32_e32 v3, v5, v3 -; GFX12-NEXT: v_min_num_f32_e32 v5, v7, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4 +; GFX12-NEXT: v_min_num_f32_e32 v6, v6, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -15258,38 +15267,38 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX940-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v5, v[0:1], off +; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX940-NEXT: v_min_f32_e32 v3, v4, v3 -; GFX940-NEXT: v_min_f32_e32 v4, v7, v6 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB57_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15299,41 +15308,41 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX11-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX11-NEXT: v_min_f32_e32 v5, v7, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -15346,35 +15355,35 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX10-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX10-NEXT: v_min_f32_e32 v5, v7, v6 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB57_1 @@ -15385,36 +15394,36 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off +; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_min_f32_e32 v3, v4, v3 -; GFX90A-NEXT: v_min_f32_e32 v4, v7, v6 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB57_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15424,36 +15433,36 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX908-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off +; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX908-NEXT: v_min_f32_e32 v5, v7, v6 -; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX908-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB57_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15463,37 +15472,37 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX8-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_min_f32_e32 v5, v7, v6 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX8-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB57_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15599,38 +15608,38 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2044 +; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_min_num_f32_e32 v3, v5, v3 -; GFX12-NEXT: v_min_num_f32_e32 v5, v7, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4 +; GFX12-NEXT: v_min_num_f32_e32 v6, v6, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -15644,38 +15653,38 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX940-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2044 +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX940-NEXT: v_min_f32_e32 v3, v4, v3 -; GFX940-NEXT: v_min_f32_e32 v4, v7, v6 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB58_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15685,41 +15694,41 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX11-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044 +; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX11-NEXT: v_min_f32_e32 v5, v7, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_min_f32_e32 v6, v6, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -15732,35 +15741,35 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX10-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX10-NEXT: v_min_f32_e32 v5, v7, v6 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB58_1 @@ -15771,36 +15780,36 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_min_f32_e32 v3, v4, v3 -; GFX90A-NEXT: v_min_f32_e32 v4, v7, v6 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB58_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15810,36 +15819,36 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX908-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX908-NEXT: v_min_f32_e32 v5, v7, v6 -; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX908-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB58_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15851,37 +15860,37 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_min_f32_e32 v5, v7, v6 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX8-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB58_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15988,38 +15997,38 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:-2048 +; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_min_num_f32_e32 v3, v5, v3 -; GFX12-NEXT: v_min_num_f32_e32 v5, v7, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4 +; GFX12-NEXT: v_min_num_f32_e32 v6, v6, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -16033,38 +16042,38 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX940-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:-2048 +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX940-NEXT: v_min_f32_e32 v3, v4, v3 -; GFX940-NEXT: v_min_f32_e32 v4, v7, v6 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB59_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16074,41 +16083,41 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX11-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:-2048 +; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX11-NEXT: v_min_f32_e32 v5, v7, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -16121,35 +16130,35 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX10-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:-2048 +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX10-NEXT: v_min_f32_e32 v5, v7, v6 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB59_1 @@ -16160,36 +16169,36 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:-2048 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_min_f32_e32 v3, v4, v3 -; GFX90A-NEXT: v_min_f32_e32 v4, v7, v6 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB59_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16199,36 +16208,36 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX908-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:-2048 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX908-NEXT: v_min_f32_e32 v5, v7, v6 -; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX908-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB59_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16240,37 +16249,37 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_min_f32_e32 v5, v7, v6 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX8-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB59_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16386,37 +16395,39 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX12-NEXT: v_dual_min_num_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_min_num_f32_e32 v3, v6, v3 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-NEXT: v_min_num_f32_e32 v5, v5, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v4 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -16433,35 +16444,35 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v6, v4 -; GFX940-NEXT: v_min_f32_e32 v3, v7, v3 -; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v3, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX940-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX940-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB60_1 @@ -16474,39 +16485,41 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: v_dual_min_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_min_f32_e32 v3, v6, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -16521,34 +16534,34 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_min_f32_e32 v3, v6, v3 -; GFX10-NEXT: v_min_f32_e32 v5, v7, v5 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX10-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB60_1 @@ -16562,35 +16575,35 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v6, v4 -; GFX90A-NEXT: v_min_f32_e32 v3, v7, v3 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX90A-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX90A-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB60_1 @@ -16604,33 +16617,33 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_min_f32_e32 v5, v6, v5 -; GFX908-NEXT: v_min_f32_e32 v3, v7, v3 -; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: v_mov_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX908-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB60_1 @@ -16646,34 +16659,34 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX8-NEXT: v_min_f32_e32 v5, v6, v5 -; GFX8-NEXT: v_min_f32_e32 v0, v7, v0 -; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB60_1 @@ -16785,39 +16798,39 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2044 +; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_min_num_f32_e32 v3, v5, v3 -; GFX12-NEXT: v_min_num_f32_e32 v5, v7, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4 +; GFX12-NEXT: v_min_num_f32_e32 v6, v6, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -16831,38 +16844,38 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX940-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2044 +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX940-NEXT: v_min_f32_e32 v3, v4, v3 -; GFX940-NEXT: v_min_f32_e32 v4, v7, v6 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB61_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16872,41 +16885,41 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX11-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044 +; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX11-NEXT: v_min_f32_e32 v5, v7, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -16919,35 +16932,35 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX10-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX10-NEXT: v_min_f32_e32 v5, v7, v6 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB61_1 @@ -16958,38 +16971,38 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX90A-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_min_f32_e32 v3, v4, v3 -; GFX90A-NEXT: v_min_f32_e32 v4, v7, v6 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB61_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16999,36 +17012,36 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX908-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX908-NEXT: v_min_f32_e32 v5, v7, v6 -; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX908-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB61_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -17040,37 +17053,37 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_min_f32_e32 v5, v7, v6 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX8-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB61_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll index 88a7a5f4d9c6b..3dbf6477a7cb8 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll @@ -14477,36 +14477,38 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX12-NEXT: v_dual_sub_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_sub_f32_e32 v3, v6, v3 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -14523,35 +14525,35 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX940-NEXT: v_sub_f32_e32 v4, v6, v4 -; GFX940-NEXT: v_sub_f32_e32 v3, v7, v3 -; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v3, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX940-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX940-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB50_1 @@ -14564,39 +14566,41 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: v_dual_sub_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_sub_f32_e32 v3, v6, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -14611,34 +14615,34 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_sub_f32_e32 v3, v6, v3 -; GFX10-NEXT: v_sub_f32_e32 v5, v7, v5 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX10-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB50_1 @@ -14652,33 +14656,33 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_sub_f32_e32 v4, v6, v4 -; GFX90A-NEXT: v_sub_f32_e32 v3, v7, v3 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX90A-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB50_1 @@ -14692,33 +14696,33 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_sub_f32_e32 v5, v6, v5 -; GFX908-NEXT: v_sub_f32_e32 v3, v7, v3 -; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: v_mov_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX908-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB50_1 @@ -14732,34 +14736,34 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_sub_f32_e32 v5, v6, v5 -; GFX8-NEXT: v_sub_f32_e32 v3, v7, v3 -; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v3 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX8-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB50_1 @@ -14872,36 +14876,38 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX12-NEXT: v_dual_sub_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_sub_f32_e32 v3, v6, v3 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -14918,35 +14924,35 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX940-NEXT: v_sub_f32_e32 v4, v6, v4 -; GFX940-NEXT: v_sub_f32_e32 v3, v7, v3 -; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v3, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX940-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX940-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB51_1 @@ -14959,39 +14965,41 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: v_dual_sub_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_sub_f32_e32 v3, v6, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -15006,34 +15014,34 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_sub_f32_e32 v3, v6, v3 -; GFX10-NEXT: v_sub_f32_e32 v5, v7, v5 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX10-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB51_1 @@ -15047,33 +15055,33 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_sub_f32_e32 v4, v6, v4 -; GFX90A-NEXT: v_sub_f32_e32 v3, v7, v3 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX90A-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB51_1 @@ -15087,33 +15095,33 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_sub_f32_e32 v5, v6, v5 -; GFX908-NEXT: v_sub_f32_e32 v3, v7, v3 -; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: v_mov_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX908-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB51_1 @@ -15129,34 +15137,34 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX8-NEXT: v_sub_f32_e32 v5, v6, v5 -; GFX8-NEXT: v_sub_f32_e32 v0, v7, v0 -; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB51_1 @@ -15269,36 +15277,38 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX12-NEXT: v_dual_sub_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_sub_f32_e32 v3, v6, v3 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -15315,35 +15325,35 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX940-NEXT: v_sub_f32_e32 v4, v6, v4 -; GFX940-NEXT: v_sub_f32_e32 v3, v7, v3 -; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v3, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX940-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX940-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:-2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB52_1 @@ -15356,39 +15366,41 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: v_dual_sub_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_sub_f32_e32 v3, v6, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -15403,34 +15415,34 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_sub_f32_e32 v3, v6, v3 -; GFX10-NEXT: v_sub_f32_e32 v5, v7, v5 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX10-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:-2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB52_1 @@ -15444,33 +15456,33 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_sub_f32_e32 v4, v6, v4 -; GFX90A-NEXT: v_sub_f32_e32 v3, v7, v3 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX90A-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB52_1 @@ -15484,33 +15496,33 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_sub_f32_e32 v5, v6, v5 -; GFX908-NEXT: v_sub_f32_e32 v3, v7, v3 -; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX908-NEXT: v_mov_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX908-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB52_1 @@ -15526,34 +15538,34 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX8-NEXT: v_sub_f32_e32 v5, v6, v5 -; GFX8-NEXT: v_sub_f32_e32 v0, v7, v0 -; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB52_1 @@ -15669,38 +15681,38 @@ define void @global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_sub_f32_e32 v3, v5, v3 -; GFX12-NEXT: v_sub_f32_e32 v5, v7, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX12-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -15714,38 +15726,38 @@ define void @global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX940-LABEL: global_agent_atomic_fsub_noret_v2bf16: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v5, v[0:1], off +; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX940-NEXT: v_sub_f32_e32 v3, v4, v3 -; GFX940-NEXT: v_sub_f32_e32 v4, v7, v6 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX940-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB53_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15755,41 +15767,41 @@ define void @global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX11-LABEL: global_agent_atomic_fsub_noret_v2bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_sub_f32_e32 v3, v5, v3 -; GFX11-NEXT: v_sub_f32_e32 v5, v7, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -15802,35 +15814,35 @@ define void @global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX10-LABEL: global_agent_atomic_fsub_noret_v2bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_sub_f32_e32 v3, v5, v3 -; GFX10-NEXT: v_sub_f32_e32 v5, v7, v6 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB53_1 @@ -15841,36 +15853,36 @@ define void @global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX90A-LABEL: global_agent_atomic_fsub_noret_v2bf16: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off +; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_sub_f32_e32 v3, v4, v3 -; GFX90A-NEXT: v_sub_f32_e32 v4, v7, v6 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB53_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15880,36 +15892,36 @@ define void @global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX908-LABEL: global_agent_atomic_fsub_noret_v2bf16: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off +; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_sub_f32_e32 v3, v5, v3 -; GFX908-NEXT: v_sub_f32_e32 v5, v7, v6 -; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX908-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX908-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB53_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15919,37 +15931,37 @@ define void @global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX8-LABEL: global_agent_atomic_fsub_noret_v2bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_sub_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_sub_f32_e32 v5, v7, v6 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX8-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB53_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16055,38 +16067,38 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2044 +; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_sub_f32_e32 v3, v5, v3 -; GFX12-NEXT: v_sub_f32_e32 v5, v7, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX12-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -16100,38 +16112,38 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX940-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2044 +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX940-NEXT: v_sub_f32_e32 v3, v4, v3 -; GFX940-NEXT: v_sub_f32_e32 v4, v7, v6 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX940-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB54_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16141,41 +16153,41 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX11-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_pos: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044 +; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_sub_f32_e32 v3, v5, v3 -; GFX11-NEXT: v_sub_f32_e32 v5, v7, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -16188,35 +16200,35 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX10-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_pos: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_sub_f32_e32 v3, v5, v3 -; GFX10-NEXT: v_sub_f32_e32 v5, v7, v6 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB54_1 @@ -16227,36 +16239,36 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX90A-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_sub_f32_e32 v3, v4, v3 -; GFX90A-NEXT: v_sub_f32_e32 v4, v7, v6 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB54_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16266,36 +16278,36 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX908-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_pos: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_sub_f32_e32 v3, v5, v3 -; GFX908-NEXT: v_sub_f32_e32 v5, v7, v6 -; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX908-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX908-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB54_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16307,37 +16319,37 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_sub_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_sub_f32_e32 v5, v7, v6 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX8-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB54_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16444,38 +16456,38 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:-2048 +; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_sub_f32_e32 v3, v5, v3 -; GFX12-NEXT: v_sub_f32_e32 v5, v7, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX12-NEXT: v_sub_f32_e32 v6, v6, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -16489,38 +16501,38 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX940-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_neg: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:-2048 +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX940-NEXT: v_sub_f32_e32 v3, v4, v3 -; GFX940-NEXT: v_sub_f32_e32 v4, v7, v6 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX940-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB55_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16530,41 +16542,41 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX11-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_neg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:-2048 +; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_sub_f32_e32 v3, v5, v3 -; GFX11-NEXT: v_sub_f32_e32 v5, v7, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -16577,35 +16589,35 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX10-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_neg: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:-2048 +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_sub_f32_e32 v3, v5, v3 -; GFX10-NEXT: v_sub_f32_e32 v5, v7, v6 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB55_1 @@ -16616,36 +16628,36 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX90A-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_neg: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:-2048 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_sub_f32_e32 v3, v4, v3 -; GFX90A-NEXT: v_sub_f32_e32 v4, v7, v6 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB55_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16655,36 +16667,36 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX908-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_neg: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:-2048 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_sub_f32_e32 v3, v5, v3 -; GFX908-NEXT: v_sub_f32_e32 v5, v7, v6 -; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX908-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX908-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB55_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16696,37 +16708,37 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_sub_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_sub_f32_e32 v5, v7, v6 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX8-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB55_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16842,37 +16854,39 @@ define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX12-NEXT: v_dual_sub_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_sub_f32_e32 v3, v6, v3 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -16889,35 +16903,35 @@ define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX940-NEXT: v_sub_f32_e32 v4, v6, v4 -; GFX940-NEXT: v_sub_f32_e32 v3, v7, v3 -; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v3, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX940-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX940-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB56_1 @@ -16930,39 +16944,41 @@ define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: v_dual_sub_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_sub_f32_e32 v3, v6, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -16977,34 +16993,34 @@ define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_sub_f32_e32 v3, v6, v3 -; GFX10-NEXT: v_sub_f32_e32 v5, v7, v5 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX10-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB56_1 @@ -17018,35 +17034,35 @@ define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_sub_f32_e32 v4, v6, v4 -; GFX90A-NEXT: v_sub_f32_e32 v3, v7, v3 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX90A-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB56_1 @@ -17060,33 +17076,33 @@ define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_sub_f32_e32 v5, v6, v5 -; GFX908-NEXT: v_sub_f32_e32 v3, v7, v3 -; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: v_mov_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX908-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB56_1 @@ -17102,34 +17118,34 @@ define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX8-NEXT: v_sub_f32_e32 v5, v6, v5 -; GFX8-NEXT: v_sub_f32_e32 v0, v7, v0 -; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB56_1 @@ -17241,39 +17257,39 @@ define void @global_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2044 +; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_sub_f32_e32 v3, v5, v3 -; GFX12-NEXT: v_sub_f32_e32 v5, v7, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX12-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -17287,38 +17303,38 @@ define void @global_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX940-LABEL: global_system_atomic_fsub_noret_v2bf16__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2044 +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX940-NEXT: v_sub_f32_e32 v3, v4, v3 -; GFX940-NEXT: v_sub_f32_e32 v4, v7, v6 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX940-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB57_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -17328,41 +17344,41 @@ define void @global_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX11-LABEL: global_system_atomic_fsub_noret_v2bf16__offset12b_pos: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044 +; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_sub_f32_e32 v3, v5, v3 -; GFX11-NEXT: v_sub_f32_e32 v5, v7, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -17375,35 +17391,35 @@ define void @global_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX10-LABEL: global_system_atomic_fsub_noret_v2bf16__offset12b_pos: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX10-NEXT: v_sub_f32_e32 v3, v5, v3 -; GFX10-NEXT: v_sub_f32_e32 v5, v7, v6 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB57_1 @@ -17414,38 +17430,38 @@ define void @global_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX90A-LABEL: global_system_atomic_fsub_noret_v2bf16__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX90A-NEXT: v_sub_f32_e32 v3, v4, v3 -; GFX90A-NEXT: v_sub_f32_e32 v4, v7, v6 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB57_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -17455,36 +17471,36 @@ define void @global_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX908-LABEL: global_system_atomic_fsub_noret_v2bf16__offset12b_pos: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX908-NEXT: v_sub_f32_e32 v3, v5, v3 -; GFX908-NEXT: v_sub_f32_e32 v5, v7, v6 -; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX908-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX908-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB57_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -17496,37 +17512,37 @@ define void @global_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX8-NEXT: v_sub_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_sub_f32_e32 v5, v7, v6 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX8-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB57_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll b/llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll index dde6671178af7..b9592a9ff9073 100644 --- a/llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll +++ b/llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll @@ -56,19 +56,19 @@ define amdgpu_kernel void @test_move_load_address_to_vgpr_d16_hi(ptr addrspace(1 ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: s_movk_i32 s2, 0x100 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_ushort v0, v1, s[0:1] glc ; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v2, s1 +; GCN-NEXT: s_movk_i32 s1, 0x100 ; GCN-NEXT: .LBB1_1: ; %bb3 ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: v_lshlrev_b64 v[2:3], 1, v[0:1] -; GCN-NEXT: v_mov_b32_e32 v0, s1 -; GCN-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GCN-NEXT: v_addc_co_u32_e32 v3, vcc, v0, v3, vcc -; GCN-NEXT: global_load_short_d16_hi v0, v[2:3], off glc +; GCN-NEXT: v_lshlrev_b64 v[3:4], 1, v[0:1] +; GCN-NEXT: v_add_co_u32_e32 v3, vcc, s0, v3 +; GCN-NEXT: v_addc_co_u32_e32 v4, vcc, v2, v4, vcc +; GCN-NEXT: global_load_short_d16_hi v0, v[3:4], off glc ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s2, v0 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s1, v0 ; GCN-NEXT: s_cbranch_vccz .LBB1_1 ; GCN-NEXT: ; %bb.2: ; %bb2 ; GCN-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll index 10d6a54baf665..f7882e6f12022 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll @@ -2528,21 +2528,19 @@ define amdgpu_gfx void @global_atomic_nand_i32_noret_scalar(ptr addrspace(1) inr ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: flat_load_dword v1, v[0:1] +; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: .LBB55_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_and_b32_e32 v0, s6, v1 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_not_b32_e32 v0, v0 -; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; VI-NEXT: v_and_b32_e32 v2, s6, v3 +; VI-NEXT: v_not_b32_e32 v2, v2 +; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_mov_b32_e32 v3, v2 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB55_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2623,25 +2621,23 @@ define amdgpu_gfx void @global_atomic_nand_i32_noret_offset_scalar(ptr addrspace ; VI-NEXT: s_addc_u32 s35, s5, 0 ; VI-NEXT: v_mov_b32_e32 v0, s34 ; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: flat_load_dword v1, v[0:1] -; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: .LBB56_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_and_b32_e32 v0, s6, v1 -; VI-NEXT: v_mov_b32_e32 v2, s34 -; VI-NEXT: v_mov_b32_e32 v3, s35 -; VI-NEXT: v_not_b32_e32 v0, v0 -; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; VI-NEXT: v_and_b32_e32 v2, s6, v3 +; VI-NEXT: v_not_b32_e32 v2, v2 +; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; VI-NEXT: v_mov_b32_e32 v1, v0 -; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: v_mov_b32_e32 v3, v2 +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB56_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[36:37] +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_nand_i32_noret_offset_scalar: @@ -2718,19 +2714,19 @@ define amdgpu_gfx i32 @global_atomic_nand_i32_ret_scalar(ptr addrspace(1) inreg ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: .LBB57_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, v0 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_and_b32_e32 v0, s6, v1 -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_not_b32_e32 v0, v0 -; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; VI-NEXT: v_mov_b32_e32 v4, v0 +; VI-NEXT: v_and_b32_e32 v0, s6, v4 +; VI-NEXT: v_not_b32_e32 v3, v0 +; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB57_1 @@ -2810,27 +2806,25 @@ define amdgpu_gfx i32 @global_atomic_nand_i32_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 16 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v0, s34 -; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: v_mov_b32_e32 v1, s34 +; VI-NEXT: v_mov_b32_e32 v2, s35 +; VI-NEXT: flat_load_dword v0, v[1:2] +; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: .LBB58_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, v0 -; VI-NEXT: v_mov_b32_e32 v2, s34 -; VI-NEXT: v_and_b32_e32 v0, s6, v1 -; VI-NEXT: v_mov_b32_e32 v3, s35 -; VI-NEXT: v_not_b32_e32 v0, v0 -; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; VI-NEXT: v_mov_b32_e32 v4, v0 +; VI-NEXT: v_and_b32_e32 v0, s6, v4 +; VI-NEXT: v_not_b32_e32 v3, v0 +; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB58_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[36:37] +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_nand_i32_ret_offset_scalar: @@ -4304,20 +4298,18 @@ define amdgpu_gfx void @global_atomic_max_i32_noret_scalar(ptr addrspace(1) inre ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: flat_load_dword v1, v[0:1] +; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: .LBB87_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_max_i32_e32 v0, s6, v1 -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; VI-NEXT: v_max_i32_e32 v2, s6, v3 +; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_mov_b32_e32 v3, v2 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB87_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4396,24 +4388,22 @@ define amdgpu_gfx void @global_atomic_max_i32_noret_offset_scalar(ptr addrspace( ; VI-NEXT: s_addc_u32 s35, s5, 0 ; VI-NEXT: v_mov_b32_e32 v0, s34 ; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: flat_load_dword v1, v[0:1] -; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: .LBB88_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_mov_b32_e32 v2, s34 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_max_i32_e32 v0, s6, v1 -; VI-NEXT: v_mov_b32_e32 v3, s35 -; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; VI-NEXT: v_max_i32_e32 v2, s6, v3 +; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; VI-NEXT: v_mov_b32_e32 v1, v0 -; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: v_mov_b32_e32 v3, v2 +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB88_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[36:37] +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_max_i32_noret_offset_scalar: @@ -4488,18 +4478,18 @@ define amdgpu_gfx i32 @global_atomic_max_i32_ret_scalar(ptr addrspace(1) inreg % ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: .LBB89_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, v0 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_max_i32_e32 v0, s6, v1 -; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; VI-NEXT: v_mov_b32_e32 v4, v0 +; VI-NEXT: v_max_i32_e32 v3, s6, v4 +; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB89_1 @@ -4577,26 +4567,24 @@ define amdgpu_gfx i32 @global_atomic_max_i32_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 16 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v0, s34 -; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: v_mov_b32_e32 v1, s34 +; VI-NEXT: v_mov_b32_e32 v2, s35 +; VI-NEXT: flat_load_dword v0, v[1:2] +; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: .LBB90_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, v0 -; VI-NEXT: v_mov_b32_e32 v2, s34 -; VI-NEXT: v_mov_b32_e32 v3, s35 -; VI-NEXT: v_max_i32_e32 v0, s6, v1 -; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; VI-NEXT: v_mov_b32_e32 v4, v0 +; VI-NEXT: v_max_i32_e32 v3, s6, v4 +; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB90_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[36:37] +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_max_i32_ret_offset_scalar: @@ -4665,26 +4653,26 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i ; VI-NEXT: s_ashr_i32 s5, s3, 31 ; VI-NEXT: s_mov_b32 s4, s3 ; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; VI-NEXT: s_add_u32 s0, s0, s4 -; VI-NEXT: s_addc_u32 s1, s1, s5 -; VI-NEXT: s_load_dword s3, s[0:1], 0x10 -; VI-NEXT: s_add_u32 s0, s0, 16 -; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: s_add_u32 s4, s0, s4 +; VI-NEXT: s_addc_u32 s5, s1, s5 +; VI-NEXT: s_load_dword s3, s[4:5], 0x10 +; VI-NEXT: s_add_u32 s4, s4, 16 +; VI-NEXT: s_addc_u32 s5, s5, 0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: s_mov_b64 s[0:1], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: .LBB91_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_max_i32_e32 v0, s2, v1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; VI-NEXT: v_max_i32_e32 v2, s2, v3 +; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: v_mov_b32_e32 v1, v0 -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; VI-NEXT: v_mov_b32_e32 v3, v2 +; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] ; VI-NEXT: s_cbranch_execnz .LBB91_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_endpgm @@ -4771,32 +4759,32 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %ou ; VI-NEXT: s_ashr_i32 s5, s7, 31 ; VI-NEXT: s_mov_b32 s4, s7 ; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; VI-NEXT: s_add_u32 s0, s0, s4 -; VI-NEXT: s_addc_u32 s1, s1, s5 -; VI-NEXT: s_load_dword s7, s[0:1], 0x10 -; VI-NEXT: s_add_u32 s0, s0, 16 -; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: s_add_u32 s4, s0, s4 +; VI-NEXT: s_addc_u32 s5, s1, s5 +; VI-NEXT: s_load_dword s7, s[4:5], 0x10 +; VI-NEXT: s_add_u32 s4, s4, 16 +; VI-NEXT: s_addc_u32 s5, s5, 0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: s_mov_b64 s[0:1], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s7 +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: .LBB92_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_mov_b32_e32 v1, v0 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_max_i32_e32 v0, s6, v1 -; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; VI-NEXT: v_mov_b32_e32 v3, v2 +; VI-NEXT: v_max_i32_e32 v2, s6, v3 +; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] ; VI-NEXT: s_cbranch_execnz .LBB92_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_mov_b32_e32 v1, s2 -; VI-NEXT: v_mov_b32_e32 v2, s3 -; VI-NEXT: flat_store_dword v[1:2], v0 +; VI-NEXT: s_or_b64 exec, exec, s[0:1] +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_max_i32_ret_addr64_offset: @@ -4878,24 +4866,24 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in, ; VI-NEXT: s_ashr_i32 s5, s3, 31 ; VI-NEXT: s_mov_b32 s4, s3 ; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; VI-NEXT: s_add_u32 s0, s0, s4 -; VI-NEXT: s_addc_u32 s1, s1, s5 -; VI-NEXT: s_load_dword s3, s[0:1], 0x0 -; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: s_add_u32 s4, s0, s4 +; VI-NEXT: s_addc_u32 s5, s1, s5 +; VI-NEXT: s_load_dword s3, s[4:5], 0x0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: s_mov_b64 s[0:1], 0 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: .LBB93_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_max_i32_e32 v0, s2, v1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; VI-NEXT: v_max_i32_e32 v2, s2, v3 +; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: v_mov_b32_e32 v1, v0 -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; VI-NEXT: v_mov_b32_e32 v3, v2 +; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] ; VI-NEXT: s_cbranch_execnz .LBB93_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_endpgm @@ -4981,30 +4969,30 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr ; VI-NEXT: s_ashr_i32 s5, s7, 31 ; VI-NEXT: s_mov_b32 s4, s7 ; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; VI-NEXT: s_add_u32 s0, s0, s4 -; VI-NEXT: s_addc_u32 s1, s1, s5 -; VI-NEXT: s_load_dword s7, s[0:1], 0x0 -; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: s_add_u32 s4, s0, s4 +; VI-NEXT: s_addc_u32 s5, s1, s5 +; VI-NEXT: s_load_dword s7, s[4:5], 0x0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: s_mov_b64 s[0:1], 0 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s7 +; VI-NEXT: v_mov_b32_e32 v2, s7 ; VI-NEXT: .LBB94_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_mov_b32_e32 v1, v0 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_max_i32_e32 v0, s6, v1 -; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; VI-NEXT: v_mov_b32_e32 v3, v2 +; VI-NEXT: v_max_i32_e32 v2, s6, v3 +; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] ; VI-NEXT: s_cbranch_execnz .LBB94_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_mov_b32_e32 v1, s2 -; VI-NEXT: v_mov_b32_e32 v2, s3 -; VI-NEXT: flat_store_dword v[1:2], v0 +; VI-NEXT: s_or_b64 exec, exec, s[0:1] +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_max_i32_ret_addr64: @@ -5563,20 +5551,18 @@ define amdgpu_gfx void @global_atomic_umax_i32_noret_scalar(ptr addrspace(1) inr ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: flat_load_dword v1, v[0:1] +; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: .LBB101_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_max_u32_e32 v0, s6, v1 -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; VI-NEXT: v_max_u32_e32 v2, s6, v3 +; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_mov_b32_e32 v3, v2 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB101_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5655,24 +5641,22 @@ define amdgpu_gfx void @global_atomic_umax_i32_noret_offset_scalar(ptr addrspace ; VI-NEXT: s_addc_u32 s35, s5, 0 ; VI-NEXT: v_mov_b32_e32 v0, s34 ; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: flat_load_dword v1, v[0:1] -; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: .LBB102_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_mov_b32_e32 v2, s34 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_max_u32_e32 v0, s6, v1 -; VI-NEXT: v_mov_b32_e32 v3, s35 -; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; VI-NEXT: v_max_u32_e32 v2, s6, v3 +; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; VI-NEXT: v_mov_b32_e32 v1, v0 -; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: v_mov_b32_e32 v3, v2 +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB102_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[36:37] +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umax_i32_noret_offset_scalar: @@ -5747,18 +5731,18 @@ define amdgpu_gfx i32 @global_atomic_umax_i32_ret_scalar(ptr addrspace(1) inreg ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: .LBB103_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, v0 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_max_u32_e32 v0, s6, v1 -; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; VI-NEXT: v_mov_b32_e32 v4, v0 +; VI-NEXT: v_max_u32_e32 v3, s6, v4 +; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB103_1 @@ -5836,26 +5820,24 @@ define amdgpu_gfx i32 @global_atomic_umax_i32_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 16 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v0, s34 -; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: v_mov_b32_e32 v1, s34 +; VI-NEXT: v_mov_b32_e32 v2, s35 +; VI-NEXT: flat_load_dword v0, v[1:2] +; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: .LBB104_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, v0 -; VI-NEXT: v_mov_b32_e32 v2, s34 -; VI-NEXT: v_mov_b32_e32 v3, s35 -; VI-NEXT: v_max_u32_e32 v0, s6, v1 -; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; VI-NEXT: v_mov_b32_e32 v4, v0 +; VI-NEXT: v_max_u32_e32 v3, s6, v4 +; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB104_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[36:37] +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umax_i32_ret_offset_scalar: @@ -5924,26 +5906,26 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out, ; VI-NEXT: s_ashr_i32 s5, s3, 31 ; VI-NEXT: s_mov_b32 s4, s3 ; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; VI-NEXT: s_add_u32 s0, s0, s4 -; VI-NEXT: s_addc_u32 s1, s1, s5 -; VI-NEXT: s_load_dword s3, s[0:1], 0x10 -; VI-NEXT: s_add_u32 s0, s0, 16 -; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: s_add_u32 s4, s0, s4 +; VI-NEXT: s_addc_u32 s5, s1, s5 +; VI-NEXT: s_load_dword s3, s[4:5], 0x10 +; VI-NEXT: s_add_u32 s4, s4, 16 +; VI-NEXT: s_addc_u32 s5, s5, 0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: s_mov_b64 s[0:1], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: .LBB105_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_max_u32_e32 v0, s2, v1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; VI-NEXT: v_max_u32_e32 v2, s2, v3 +; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: v_mov_b32_e32 v1, v0 -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; VI-NEXT: v_mov_b32_e32 v3, v2 +; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] ; VI-NEXT: s_cbranch_execnz .LBB105_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_endpgm @@ -6030,32 +6012,32 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %o ; VI-NEXT: s_ashr_i32 s5, s7, 31 ; VI-NEXT: s_mov_b32 s4, s7 ; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; VI-NEXT: s_add_u32 s0, s0, s4 -; VI-NEXT: s_addc_u32 s1, s1, s5 -; VI-NEXT: s_load_dword s7, s[0:1], 0x10 -; VI-NEXT: s_add_u32 s0, s0, 16 -; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: s_add_u32 s4, s0, s4 +; VI-NEXT: s_addc_u32 s5, s1, s5 +; VI-NEXT: s_load_dword s7, s[4:5], 0x10 +; VI-NEXT: s_add_u32 s4, s4, 16 +; VI-NEXT: s_addc_u32 s5, s5, 0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: s_mov_b64 s[0:1], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s7 +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: .LBB106_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_mov_b32_e32 v1, v0 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_max_u32_e32 v0, s6, v1 -; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; VI-NEXT: v_mov_b32_e32 v3, v2 +; VI-NEXT: v_max_u32_e32 v2, s6, v3 +; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] ; VI-NEXT: s_cbranch_execnz .LBB106_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_mov_b32_e32 v1, s2 -; VI-NEXT: v_mov_b32_e32 v2, s3 -; VI-NEXT: flat_store_dword v[1:2], v0 +; VI-NEXT: s_or_b64 exec, exec, s[0:1] +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_umax_i32_ret_addr64_offset: @@ -6145,30 +6127,30 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr ; VI-NEXT: s_ashr_i32 s5, s7, 31 ; VI-NEXT: s_mov_b32 s4, s7 ; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; VI-NEXT: s_add_u32 s0, s0, s4 -; VI-NEXT: s_addc_u32 s1, s1, s5 -; VI-NEXT: s_load_dword s7, s[0:1], 0x0 -; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: s_add_u32 s4, s0, s4 +; VI-NEXT: s_addc_u32 s5, s1, s5 +; VI-NEXT: s_load_dword s7, s[4:5], 0x0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: s_mov_b64 s[0:1], 0 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s7 +; VI-NEXT: v_mov_b32_e32 v2, s7 ; VI-NEXT: .LBB107_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_mov_b32_e32 v1, v0 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_max_u32_e32 v0, s6, v1 -; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; VI-NEXT: v_mov_b32_e32 v3, v2 +; VI-NEXT: v_max_u32_e32 v2, s6, v3 +; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] ; VI-NEXT: s_cbranch_execnz .LBB107_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_mov_b32_e32 v1, s2 -; VI-NEXT: v_mov_b32_e32 v2, s3 -; VI-NEXT: flat_store_dword v[1:2], v0 +; VI-NEXT: s_or_b64 exec, exec, s[0:1] +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_umax_i32_ret_addr64: @@ -6727,20 +6709,18 @@ define amdgpu_gfx void @global_atomic_umin_i32_noret_scalar(ptr addrspace(1) inr ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: flat_load_dword v1, v[0:1] +; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: .LBB114_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_min_u32_e32 v0, s6, v1 -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; VI-NEXT: v_min_u32_e32 v2, s6, v3 +; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_mov_b32_e32 v3, v2 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB114_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6819,24 +6799,22 @@ define amdgpu_gfx void @global_atomic_umin_i32_noret_offset_scalar(ptr addrspace ; VI-NEXT: s_addc_u32 s35, s5, 0 ; VI-NEXT: v_mov_b32_e32 v0, s34 ; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: flat_load_dword v1, v[0:1] -; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: .LBB115_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_mov_b32_e32 v2, s34 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_min_u32_e32 v0, s6, v1 -; VI-NEXT: v_mov_b32_e32 v3, s35 -; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; VI-NEXT: v_min_u32_e32 v2, s6, v3 +; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; VI-NEXT: v_mov_b32_e32 v1, v0 -; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: v_mov_b32_e32 v3, v2 +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB115_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[36:37] +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umin_i32_noret_offset_scalar: @@ -6911,18 +6889,18 @@ define amdgpu_gfx i32 @global_atomic_umin_i32_ret_scalar(ptr addrspace(1) inreg ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: .LBB116_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, v0 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_min_u32_e32 v0, s6, v1 -; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; VI-NEXT: v_mov_b32_e32 v4, v0 +; VI-NEXT: v_min_u32_e32 v3, s6, v4 +; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB116_1 @@ -7000,26 +6978,24 @@ define amdgpu_gfx i32 @global_atomic_umin_i32_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 16 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v0, s34 -; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: v_mov_b32_e32 v1, s34 +; VI-NEXT: v_mov_b32_e32 v2, s35 +; VI-NEXT: flat_load_dword v0, v[1:2] +; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: .LBB117_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, v0 -; VI-NEXT: v_mov_b32_e32 v2, s34 -; VI-NEXT: v_mov_b32_e32 v3, s35 -; VI-NEXT: v_min_u32_e32 v0, s6, v1 -; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; VI-NEXT: v_mov_b32_e32 v4, v0 +; VI-NEXT: v_min_u32_e32 v3, s6, v4 +; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB117_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[36:37] +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umin_i32_ret_offset_scalar: @@ -7566,20 +7542,18 @@ define amdgpu_gfx void @global_atomic_min_i32_noret_scalar(ptr addrspace(1) inre ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: flat_load_dword v1, v[0:1] +; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: .LBB124_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_min_i32_e32 v0, s6, v1 -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; VI-NEXT: v_min_i32_e32 v2, s6, v3 +; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_mov_b32_e32 v3, v2 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB124_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7658,24 +7632,22 @@ define amdgpu_gfx void @global_atomic_min_i32_noret_offset_scalar(ptr addrspace( ; VI-NEXT: s_addc_u32 s35, s5, 0 ; VI-NEXT: v_mov_b32_e32 v0, s34 ; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: flat_load_dword v1, v[0:1] -; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: .LBB125_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_mov_b32_e32 v2, s34 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_min_i32_e32 v0, s6, v1 -; VI-NEXT: v_mov_b32_e32 v3, s35 -; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; VI-NEXT: v_min_i32_e32 v2, s6, v3 +; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; VI-NEXT: v_mov_b32_e32 v1, v0 -; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: v_mov_b32_e32 v3, v2 +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB125_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[36:37] +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_min_i32_noret_offset_scalar: @@ -7750,18 +7722,18 @@ define amdgpu_gfx i32 @global_atomic_min_i32_ret_scalar(ptr addrspace(1) inreg % ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: .LBB126_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, v0 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_min_i32_e32 v0, s6, v1 -; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; VI-NEXT: v_mov_b32_e32 v4, v0 +; VI-NEXT: v_min_i32_e32 v3, s6, v4 +; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB126_1 @@ -7839,26 +7811,24 @@ define amdgpu_gfx i32 @global_atomic_min_i32_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 16 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v0, s34 -; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: v_mov_b32_e32 v1, s34 +; VI-NEXT: v_mov_b32_e32 v2, s35 +; VI-NEXT: flat_load_dword v0, v[1:2] +; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: .LBB127_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, v0 -; VI-NEXT: v_mov_b32_e32 v2, s34 -; VI-NEXT: v_mov_b32_e32 v3, s35 -; VI-NEXT: v_min_i32_e32 v0, s6, v1 -; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; VI-NEXT: v_mov_b32_e32 v4, v0 +; VI-NEXT: v_min_i32_e32 v3, s6, v4 +; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB127_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[36:37] +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_min_i32_ret_offset_scalar: @@ -7927,26 +7897,26 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i ; VI-NEXT: s_ashr_i32 s5, s3, 31 ; VI-NEXT: s_mov_b32 s4, s3 ; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; VI-NEXT: s_add_u32 s0, s0, s4 -; VI-NEXT: s_addc_u32 s1, s1, s5 -; VI-NEXT: s_load_dword s3, s[0:1], 0x10 -; VI-NEXT: s_add_u32 s0, s0, 16 -; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: s_add_u32 s4, s0, s4 +; VI-NEXT: s_addc_u32 s5, s1, s5 +; VI-NEXT: s_load_dword s3, s[4:5], 0x10 +; VI-NEXT: s_add_u32 s4, s4, 16 +; VI-NEXT: s_addc_u32 s5, s5, 0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: s_mov_b64 s[0:1], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: .LBB128_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_min_i32_e32 v0, s2, v1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; VI-NEXT: v_min_i32_e32 v2, s2, v3 +; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: v_mov_b32_e32 v1, v0 -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; VI-NEXT: v_mov_b32_e32 v3, v2 +; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] ; VI-NEXT: s_cbranch_execnz .LBB128_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_endpgm @@ -8033,32 +8003,32 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %ou ; VI-NEXT: s_ashr_i32 s5, s7, 31 ; VI-NEXT: s_mov_b32 s4, s7 ; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; VI-NEXT: s_add_u32 s0, s0, s4 -; VI-NEXT: s_addc_u32 s1, s1, s5 -; VI-NEXT: s_load_dword s7, s[0:1], 0x10 -; VI-NEXT: s_add_u32 s0, s0, 16 -; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: s_add_u32 s4, s0, s4 +; VI-NEXT: s_addc_u32 s5, s1, s5 +; VI-NEXT: s_load_dword s7, s[4:5], 0x10 +; VI-NEXT: s_add_u32 s4, s4, 16 +; VI-NEXT: s_addc_u32 s5, s5, 0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: s_mov_b64 s[0:1], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s7 +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: .LBB129_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_mov_b32_e32 v1, v0 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_min_i32_e32 v0, s6, v1 -; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; VI-NEXT: v_mov_b32_e32 v3, v2 +; VI-NEXT: v_min_i32_e32 v2, s6, v3 +; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] ; VI-NEXT: s_cbranch_execnz .LBB129_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_mov_b32_e32 v1, s2 -; VI-NEXT: v_mov_b32_e32 v2, s3 -; VI-NEXT: flat_store_dword v[1:2], v0 +; VI-NEXT: s_or_b64 exec, exec, s[0:1] +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_min_i32_ret_addr64_offset: @@ -8131,25 +8101,25 @@ define amdgpu_kernel void @atomic_min_i32(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: atomic_min_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; VI-NEXT: s_load_dword s4, s[4:5], 0x2c -; VI-NEXT: s_mov_b64 s[2:3], 0 +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_mov_b64 s[0:1], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s5, s[0:1], 0x0 +; VI-NEXT: s_load_dword s3, s[6:7], 0x0 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: .LBB130_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_min_i32_e32 v0, s4, v1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; VI-NEXT: v_min_i32_e32 v2, s2, v3 +; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; VI-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; VI-NEXT: v_mov_b32_e32 v1, v0 -; VI-NEXT: s_andn2_b64 exec, exec, s[2:3] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; VI-NEXT: v_mov_b32_e32 v3, v2 +; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] ; VI-NEXT: s_cbranch_execnz .LBB130_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_endpgm @@ -8230,30 +8200,30 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr ; VI-NEXT: s_ashr_i32 s5, s7, 31 ; VI-NEXT: s_mov_b32 s4, s7 ; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; VI-NEXT: s_add_u32 s0, s0, s4 -; VI-NEXT: s_addc_u32 s1, s1, s5 -; VI-NEXT: s_load_dword s7, s[0:1], 0x0 -; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: s_add_u32 s4, s0, s4 +; VI-NEXT: s_addc_u32 s5, s1, s5 +; VI-NEXT: s_load_dword s7, s[4:5], 0x0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: s_mov_b64 s[0:1], 0 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s7 +; VI-NEXT: v_mov_b32_e32 v2, s7 ; VI-NEXT: .LBB131_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_mov_b32_e32 v1, v0 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_min_i32_e32 v0, s6, v1 -; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; VI-NEXT: v_mov_b32_e32 v3, v2 +; VI-NEXT: v_min_i32_e32 v2, s6, v3 +; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] ; VI-NEXT: s_cbranch_execnz .LBB131_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_mov_b32_e32 v1, s2 -; VI-NEXT: v_mov_b32_e32 v2, s3 -; VI-NEXT: flat_store_dword v[1:2], v0 +; VI-NEXT: s_or_b64 exec, exec, s[0:1] +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_min_i32_ret_addr64: diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll index 645bb0b117ccb..59a99a6a0328d 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll @@ -2626,14 +2626,14 @@ define amdgpu_gfx void @global_atomic_nand_i64_noret_scalar(ptr addrspace(1) inr ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: .LBB54_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_and_b32_e32 v0, s7, v3 ; VI-NEXT: v_and_b32_e32 v6, s6, v2 -; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: v_not_b32_e32 v1, v0 ; VI-NEXT: v_not_b32_e32 v0, v6 ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -2730,17 +2730,15 @@ define amdgpu_gfx void @global_atomic_nand_i64_noret_offset_scalar(ptr addrspace ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v0, s34 -; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: v_mov_b32_e32 v4, s34 +; VI-NEXT: v_mov_b32_e32 v5, s35 +; VI-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: .LBB55_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_and_b32_e32 v0, s7, v3 ; VI-NEXT: v_and_b32_e32 v6, s6, v2 -; VI-NEXT: v_mov_b32_e32 v4, s34 -; VI-NEXT: v_mov_b32_e32 v5, s35 ; VI-NEXT: v_not_b32_e32 v1, v0 ; VI-NEXT: v_not_b32_e32 v0, v6 ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -2748,12 +2746,12 @@ define amdgpu_gfx void @global_atomic_nand_i64_noret_offset_scalar(ptr addrspace ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: v_mov_b32_e32 v3, v1 -; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: v_mov_b32_e32 v2, v0 -; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB55_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[36:37] +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_nand_i64_noret_offset_scalar: @@ -2839,22 +2837,22 @@ define amdgpu_gfx i64 @global_atomic_nand_i64_ret_scalar(ptr addrspace(1) inreg ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: .LBB56_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, v1 -; VI-NEXT: v_mov_b32_e32 v2, v0 -; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_and_b32_e32 v0, s7, v3 -; VI-NEXT: v_and_b32_e32 v6, s6, v2 -; VI-NEXT: v_mov_b32_e32 v5, s5 -; VI-NEXT: v_not_b32_e32 v1, v0 -; VI-NEXT: v_not_b32_e32 v0, v6 -; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; VI-NEXT: v_mov_b32_e32 v7, v1 +; VI-NEXT: v_mov_b32_e32 v6, v0 +; VI-NEXT: v_and_b32_e32 v0, s7, v7 +; VI-NEXT: v_and_b32_e32 v1, s6, v6 +; VI-NEXT: v_not_b32_e32 v5, v0 +; VI-NEXT: v_not_b32_e32 v4, v1 +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB56_1 @@ -2943,30 +2941,28 @@ define amdgpu_gfx i64 @global_atomic_nand_i64_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v0, s34 -; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: v_mov_b32_e32 v2, s34 +; VI-NEXT: v_mov_b32_e32 v3, s35 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: .LBB57_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, v1 -; VI-NEXT: v_mov_b32_e32 v2, v0 -; VI-NEXT: v_mov_b32_e32 v4, s34 -; VI-NEXT: v_and_b32_e32 v0, s7, v3 -; VI-NEXT: v_and_b32_e32 v6, s6, v2 -; VI-NEXT: v_mov_b32_e32 v5, s35 -; VI-NEXT: v_not_b32_e32 v1, v0 -; VI-NEXT: v_not_b32_e32 v0, v6 -; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; VI-NEXT: v_mov_b32_e32 v7, v1 +; VI-NEXT: v_mov_b32_e32 v6, v0 +; VI-NEXT: v_and_b32_e32 v0, s7, v7 +; VI-NEXT: v_and_b32_e32 v1, s6, v6 +; VI-NEXT: v_not_b32_e32 v5, v0 +; VI-NEXT: v_not_b32_e32 v4, v1 +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB57_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[36:37] +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_nand_i64_ret_offset_scalar: @@ -4438,45 +4434,45 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_scalar(ptr addrspace(1) inre ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v8, s6, 0 -; SI-NEXT: v_writelane_b32 v8, s7, 1 +; SI-NEXT: v_writelane_b32 v10, s6, 0 +; SI-NEXT: v_writelane_b32 v10, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: v_mov_b32_e32 v4, s35 +; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB84_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: v_mov_b32_e32 v0, s35 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[34:35], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; SI-NEXT: v_mov_b32_e32 v0, s34 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v7, v3 -; SI-NEXT: v_mov_b32_e32 v6, v2 -; SI-NEXT: v_mov_b32_e32 v5, v1 -; SI-NEXT: v_mov_b32_e32 v4, v0 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 glc +; SI-NEXT: v_mov_b32_e32 v9, v3 +; SI-NEXT: v_mov_b32_e32 v8, v2 +; SI-NEXT: v_mov_b32_e32 v7, v1 +; SI-NEXT: v_mov_b32_e32 v6, v0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: v_mov_b32_e32 v2, v4 -; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v2, v6 +; SI-NEXT: v_mov_b32_e32 v3, v7 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB84_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v8, 1 -; SI-NEXT: v_readlane_b32 s6, v8, 0 +; SI-NEXT: v_readlane_b32 s7, v10, 1 +; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -4487,17 +4483,17 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_scalar(ptr addrspace(1) inre ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v6, s7 +; VI-NEXT: v_mov_b32_e32 v7, s6 +; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: .LBB84_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; VI-NEXT: v_mov_b32_e32 v0, s7 -; VI-NEXT: v_mov_b32_e32 v6, s6 -; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_mov_b32_e32 v5, s5 -; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -4517,14 +4513,14 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_scalar(ptr addrspace(1) inre ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v5, s7 +; GFX9-NEXT: v_mov_b32_e32 v6, s6 ; GFX9-NEXT: .LBB84_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_mov_b32_e32 v5, s6 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -4546,45 +4542,45 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_offset_scalar(ptr addrspace( ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v8, s6, 0 -; SI-NEXT: v_writelane_b32 v8, s7, 1 +; SI-NEXT: v_writelane_b32 v10, s6, 0 +; SI-NEXT: v_writelane_b32 v10, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32 ; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: v_mov_b32_e32 v4, s35 +; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB85_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: v_mov_b32_e32 v0, s35 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[34:35], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; SI-NEXT: v_mov_b32_e32 v0, s34 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v7, v3 -; SI-NEXT: v_mov_b32_e32 v6, v2 -; SI-NEXT: v_mov_b32_e32 v5, v1 -; SI-NEXT: v_mov_b32_e32 v4, v0 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 offset:32 glc +; SI-NEXT: v_mov_b32_e32 v9, v3 +; SI-NEXT: v_mov_b32_e32 v8, v2 +; SI-NEXT: v_mov_b32_e32 v7, v1 +; SI-NEXT: v_mov_b32_e32 v6, v0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: v_mov_b32_e32 v2, v4 -; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v2, v6 +; SI-NEXT: v_mov_b32_e32 v3, v7 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB85_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v8, 1 -; SI-NEXT: v_readlane_b32 s6, v8, 0 +; SI-NEXT: v_readlane_b32 s7, v10, 1 +; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -4594,31 +4590,29 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_offset_scalar(ptr addrspace( ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v0, s34 -; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: v_mov_b32_e32 v4, s34 +; VI-NEXT: v_mov_b32_e32 v5, s35 +; VI-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v6, s7 +; VI-NEXT: v_mov_b32_e32 v7, s6 ; VI-NEXT: .LBB85_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; VI-NEXT: v_mov_b32_e32 v0, s7 -; VI-NEXT: v_mov_b32_e32 v6, s6 -; VI-NEXT: v_mov_b32_e32 v4, s34 -; VI-NEXT: v_mov_b32_e32 v5, s35 -; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: v_mov_b32_e32 v3, v1 -; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: v_mov_b32_e32 v2, v0 -; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB85_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[36:37] +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_max_i64_noret_offset_scalar: @@ -4627,14 +4621,14 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_offset_scalar(ptr addrspace( ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] offset:32 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v5, s7 +; GFX9-NEXT: v_mov_b32_e32 v6, s6 ; GFX9-NEXT: .LBB85_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_mov_b32_e32 v5, s6 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -4657,45 +4651,45 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_scalar(ptr addrspace(1) inreg % ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v6, s6, 0 -; SI-NEXT: v_writelane_b32 v6, s7, 1 +; SI-NEXT: v_writelane_b32 v10, s6, 0 +; SI-NEXT: v_writelane_b32 v10, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: v_mov_b32_e32 v4, s35 +; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB86_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v5, v1 -; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: v_mov_b32_e32 v9, v1 +; SI-NEXT: v_mov_b32_e32 v8, v0 +; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[34:35], v[8:9] +; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s35 -; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[34:35], v[4:5] -; SI-NEXT: v_cndmask_b32_e32 v3, v0, v5, vcc -; SI-NEXT: v_mov_b32_e32 v0, s34 -; SI-NEXT: v_cndmask_b32_e32 v2, v0, v4, vcc -; SI-NEXT: v_mov_b32_e32 v0, v2 -; SI-NEXT: v_mov_b32_e32 v1, v3 -; SI-NEXT: v_mov_b32_e32 v2, v4 -; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v0, v6 +; SI-NEXT: v_mov_b32_e32 v1, v7 +; SI-NEXT: v_mov_b32_e32 v2, v8 +; SI-NEXT: v_mov_b32_e32 v3, v9 ; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB86_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v6, 1 -; SI-NEXT: v_readlane_b32 s6, v6, 0 +; SI-NEXT: v_readlane_b32 s7, v10, 1 +; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -4706,23 +4700,23 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_scalar(ptr addrspace(1) inreg % ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v4, s7 +; VI-NEXT: v_mov_b32_e32 v5, s6 +; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: .LBB86_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, v1 -; VI-NEXT: v_mov_b32_e32 v2, v0 -; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; VI-NEXT: v_mov_b32_e32 v0, s7 -; VI-NEXT: v_mov_b32_e32 v6, s6 -; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_mov_b32_e32 v5, s5 -; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; VI-NEXT: v_mov_b32_e32 v9, v1 +; VI-NEXT: v_mov_b32_e32 v8, v0 +; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9] +; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB86_1 @@ -4736,20 +4730,20 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_scalar(ptr addrspace(1) inreg % ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v4, s6 ; GFX9-NEXT: .LBB86_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v6, v1 -; GFX9-NEXT: v_mov_b32_e32 v5, v0 -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[5:6] -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] glc +; GFX9-NEXT: v_mov_b32_e32 v8, v1 +; GFX9-NEXT: v_mov_b32_e32 v7, v0 +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[7:8] +; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v7, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[5:8], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB86_1 @@ -4765,45 +4759,45 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_offset_scalar(ptr addrspace(1) ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v6, s6, 0 -; SI-NEXT: v_writelane_b32 v6, s7, 1 +; SI-NEXT: v_writelane_b32 v10, s6, 0 +; SI-NEXT: v_writelane_b32 v10, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32 ; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: v_mov_b32_e32 v4, s35 +; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB87_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v5, v1 -; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: v_mov_b32_e32 v9, v1 +; SI-NEXT: v_mov_b32_e32 v8, v0 +; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[34:35], v[8:9] +; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s35 -; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[34:35], v[4:5] -; SI-NEXT: v_cndmask_b32_e32 v3, v0, v5, vcc -; SI-NEXT: v_mov_b32_e32 v0, s34 -; SI-NEXT: v_cndmask_b32_e32 v2, v0, v4, vcc -; SI-NEXT: v_mov_b32_e32 v0, v2 -; SI-NEXT: v_mov_b32_e32 v1, v3 -; SI-NEXT: v_mov_b32_e32 v2, v4 -; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v0, v6 +; SI-NEXT: v_mov_b32_e32 v1, v7 +; SI-NEXT: v_mov_b32_e32 v2, v8 +; SI-NEXT: v_mov_b32_e32 v3, v9 ; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB87_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v6, 1 -; SI-NEXT: v_readlane_b32 s6, v6, 0 +; SI-NEXT: v_readlane_b32 s7, v10, 1 +; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -4813,31 +4807,29 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v0, s34 -; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: v_mov_b32_e32 v2, s34 +; VI-NEXT: v_mov_b32_e32 v3, s35 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v4, s7 +; VI-NEXT: v_mov_b32_e32 v5, s6 ; VI-NEXT: .LBB87_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, v1 -; VI-NEXT: v_mov_b32_e32 v2, v0 -; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; VI-NEXT: v_mov_b32_e32 v0, s7 -; VI-NEXT: v_mov_b32_e32 v6, s6 -; VI-NEXT: v_mov_b32_e32 v4, s34 -; VI-NEXT: v_mov_b32_e32 v5, s35 -; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; VI-NEXT: v_mov_b32_e32 v9, v1 +; VI-NEXT: v_mov_b32_e32 v8, v0 +; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9] +; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB87_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[36:37] +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_max_i64_ret_offset_scalar: @@ -4846,20 +4838,20 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_offset_scalar(ptr addrspace(1) ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:32 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v4, s6 ; GFX9-NEXT: .LBB87_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v6, v1 -; GFX9-NEXT: v_mov_b32_e32 v5, v0 -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[5:6] -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v8, v1 +; GFX9-NEXT: v_mov_b32_e32 v7, v0 +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[7:8] +; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v7, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[5:8], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB87_1 @@ -4883,29 +4875,29 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i ; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8 ; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: v_mov_b32_e32 v4, s3 +; SI-NEXT: v_mov_b32_e32 v5, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: v_mov_b32_e32 v3, s9 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: .LBB88_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: v_mov_b32_e32 v0, s3 ; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v7, v3 -; SI-NEXT: v_mov_b32_e32 v6, v2 -; SI-NEXT: v_mov_b32_e32 v5, v1 -; SI-NEXT: v_mov_b32_e32 v4, v0 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 offset:32 glc +; SI-NEXT: v_mov_b32_e32 v9, v3 +; SI-NEXT: v_mov_b32_e32 v8, v2 +; SI-NEXT: v_mov_b32_e32 v7, v1 +; SI-NEXT: v_mov_b32_e32 v6, v0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; SI-NEXT: v_mov_b32_e32 v2, v4 -; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v2, v6 +; SI-NEXT: v_mov_b32_e32 v3, v7 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] ; SI-NEXT: s_cbranch_execnz .LBB88_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4915,26 +4907,26 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; VI-NEXT: s_add_u32 s0, s0, s4 -; VI-NEXT: s_addc_u32 s1, s1, s5 +; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; VI-NEXT: s_add_u32 s0, s0, s6 +; VI-NEXT: s_addc_u32 s1, s1, s7 ; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20 ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v6, s3 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v7, s2 ; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: .LBB88_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] -; VI-NEXT: v_mov_b32_e32 v0, s3 -; VI-NEXT: v_mov_b32_e32 v6, s2 -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -4951,24 +4943,24 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX9-NEXT: s_add_u32 s0, s0, s4 ; GFX9-NEXT: s_addc_u32 s1, s1, s5 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s3 +; GFX9-NEXT: v_mov_b32_e32 v5, s2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: .LBB88_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: v_mov_b32_e32 v5, s2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] offset:32 glc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -4997,17 +4989,17 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou ; SI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: v_mov_b32_e32 v8, s5 +; SI-NEXT: v_mov_b32_e32 v9, s4 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v3, s7 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: .LBB89_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: v_mov_b32_e32 v0, s5 ; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; SI-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v9, v2, vcc ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v7, v3 ; SI-NEXT: v_mov_b32_e32 v6, v2 @@ -5034,68 +5026,68 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou ; VI-LABEL: atomic_max_i64_ret_addr64_offset: ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 +; VI-NEXT: s_mov_b64 s[8:9], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; VI-NEXT: s_add_u32 s0, s0, s6 ; VI-NEXT: s_addc_u32 s1, s1, s7 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x20 +; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20 ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: s_mov_b64 s[6:7], 0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v4, s5 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v5, s4 +; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: .LBB89_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_mov_b32_e32 v3, v1 -; VI-NEXT: v_mov_b32_e32 v2, v0 -; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[2:3] -; VI-NEXT: v_mov_b32_e32 v0, s5 -; VI-NEXT: v_mov_b32_e32 v6, s4 -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; VI-NEXT: v_mov_b32_e32 v9, v3 +; VI-NEXT: v_mov_b32_e32 v8, v2 +; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9] +; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; VI-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; VI-NEXT: s_andn2_b64 exec, exec, s[6:7] +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; VI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; VI-NEXT: s_andn2_b64 exec, exec, s[8:9] ; VI-NEXT: s_cbranch_execnz .LBB89_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[6:7] -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_or_b64 exec, exec, s[8:9] +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_max_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 ; GFX9-NEXT: s_add_u32 s0, s8, s0 ; GFX9-NEXT: s_addc_u32 s1, s9, s1 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x20 +; GFX9-NEXT: v_mov_b32_e32 v2, s13 +; GFX9-NEXT: v_mov_b32_e32 v3, s12 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: .LBB89_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_mov_b32_e32 v6, v1 -; GFX9-NEXT: v_mov_b32_e32 v5, v0 -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[5:6] -; GFX9-NEXT: v_mov_b32_e32 v0, s13 -; GFX9-NEXT: v_mov_b32_e32 v1, s12 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[0:1] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v8, v1 +; GFX9-NEXT: v_mov_b32_e32 v7, v0 +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[7:8] +; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[0:1] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB89_1 @@ -5124,29 +5116,29 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in, ; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: v_mov_b32_e32 v4, s3 +; SI-NEXT: v_mov_b32_e32 v5, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: v_mov_b32_e32 v3, s9 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: .LBB90_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: v_mov_b32_e32 v0, s3 ; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v7, v3 -; SI-NEXT: v_mov_b32_e32 v6, v2 -; SI-NEXT: v_mov_b32_e32 v5, v1 -; SI-NEXT: v_mov_b32_e32 v4, v0 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 glc +; SI-NEXT: v_mov_b32_e32 v9, v3 +; SI-NEXT: v_mov_b32_e32 v8, v2 +; SI-NEXT: v_mov_b32_e32 v7, v1 +; SI-NEXT: v_mov_b32_e32 v6, v0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; SI-NEXT: v_mov_b32_e32 v2, v4 -; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v2, v6 +; SI-NEXT: v_mov_b32_e32 v3, v7 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] ; SI-NEXT: s_cbranch_execnz .LBB90_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5158,30 +5150,30 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in, ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; VI-NEXT: s_add_u32 s0, s0, s4 -; VI-NEXT: s_addc_u32 s1, s1, s5 -; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: s_add_u32 s4, s0, s4 +; VI-NEXT: s_addc_u32 s5, s1, s5 +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: s_mov_b64 s[0:1], 0 +; VI-NEXT: v_mov_b32_e32 v6, s3 +; VI-NEXT: v_mov_b32_e32 v7, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: .LBB90_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] -; VI-NEXT: v_mov_b32_e32 v0, s3 -; VI-NEXT: v_mov_b32_e32 v6, s2 -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: v_mov_b32_e32 v3, v1 -; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; VI-NEXT: v_mov_b32_e32 v2, v0 -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] ; VI-NEXT: s_cbranch_execnz .LBB90_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_endpgm @@ -5190,24 +5182,24 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in, ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX9-NEXT: s_add_u32 s0, s0, s4 ; GFX9-NEXT: s_addc_u32 s1, s1, s5 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s3 +; GFX9-NEXT: v_mov_b32_e32 v5, s2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: .LBB90_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: v_mov_b32_e32 v5, s2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -5235,17 +5227,17 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr ; SI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: v_mov_b32_e32 v8, s5 +; SI-NEXT: v_mov_b32_e32 v9, s4 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v3, s7 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: .LBB91_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: v_mov_b32_e32 v0, s5 ; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; SI-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v9, v2, vcc ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v7, v3 ; SI-NEXT: v_mov_b32_e32 v6, v2 @@ -5274,64 +5266,64 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr ; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; VI-NEXT: s_add_u32 s0, s0, s6 -; VI-NEXT: s_addc_u32 s1, s1, s7 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 -; VI-NEXT: s_mov_b64 s[6:7], 0 +; VI-NEXT: s_add_u32 s6, s0, s6 +; VI-NEXT: s_addc_u32 s7, s1, s7 +; VI-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: s_mov_b64 s[0:1], 0 +; VI-NEXT: v_mov_b32_e32 v4, s5 +; VI-NEXT: v_mov_b32_e32 v5, s4 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v3, s9 +; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: .LBB91_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_mov_b32_e32 v3, v1 -; VI-NEXT: v_mov_b32_e32 v2, v0 -; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[2:3] -; VI-NEXT: v_mov_b32_e32 v0, s5 -; VI-NEXT: v_mov_b32_e32 v6, s4 -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; VI-NEXT: v_mov_b32_e32 v9, v3 +; VI-NEXT: v_mov_b32_e32 v8, v2 +; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9] +; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; VI-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; VI-NEXT: s_andn2_b64 exec, exec, s[6:7] +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] ; VI-NEXT: s_cbranch_execnz .LBB91_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[6:7] -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_or_b64 exec, exec, s[0:1] +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_max_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 ; GFX9-NEXT: s_add_u32 s0, s8, s0 ; GFX9-NEXT: s_addc_u32 s1, s9, s1 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v2, s13 +; GFX9-NEXT: v_mov_b32_e32 v3, s12 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: .LBB91_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_mov_b32_e32 v6, v1 -; GFX9-NEXT: v_mov_b32_e32 v5, v0 -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[5:6] -; GFX9-NEXT: v_mov_b32_e32 v0, s13 -; GFX9-NEXT: v_mov_b32_e32 v1, s12 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[0:1] glc +; GFX9-NEXT: v_mov_b32_e32 v8, v1 +; GFX9-NEXT: v_mov_b32_e32 v7, v0 +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[7:8] +; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB91_1 @@ -5904,45 +5896,45 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_scalar(ptr addrspace(1) inr ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v8, s6, 0 -; SI-NEXT: v_writelane_b32 v8, s7, 1 +; SI-NEXT: v_writelane_b32 v10, s6, 0 +; SI-NEXT: v_writelane_b32 v10, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: v_mov_b32_e32 v4, s35 +; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB98_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: v_mov_b32_e32 v0, s35 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[34:35], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; SI-NEXT: v_mov_b32_e32 v0, s34 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v7, v3 -; SI-NEXT: v_mov_b32_e32 v6, v2 -; SI-NEXT: v_mov_b32_e32 v5, v1 -; SI-NEXT: v_mov_b32_e32 v4, v0 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 glc +; SI-NEXT: v_mov_b32_e32 v9, v3 +; SI-NEXT: v_mov_b32_e32 v8, v2 +; SI-NEXT: v_mov_b32_e32 v7, v1 +; SI-NEXT: v_mov_b32_e32 v6, v0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: v_mov_b32_e32 v2, v4 -; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v2, v6 +; SI-NEXT: v_mov_b32_e32 v3, v7 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB98_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v8, 1 -; SI-NEXT: v_readlane_b32 s6, v8, 0 +; SI-NEXT: v_readlane_b32 s7, v10, 1 +; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -5953,17 +5945,17 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_scalar(ptr addrspace(1) inr ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v6, s7 +; VI-NEXT: v_mov_b32_e32 v7, s6 +; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: .LBB98_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] -; VI-NEXT: v_mov_b32_e32 v0, s7 -; VI-NEXT: v_mov_b32_e32 v6, s6 -; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_mov_b32_e32 v5, s5 -; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -5983,14 +5975,14 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_scalar(ptr addrspace(1) inr ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v5, s7 +; GFX9-NEXT: v_mov_b32_e32 v6, s6 ; GFX9-NEXT: .LBB98_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_mov_b32_e32 v5, s6 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -6012,45 +6004,45 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_offset_scalar(ptr addrspace ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v8, s6, 0 -; SI-NEXT: v_writelane_b32 v8, s7, 1 +; SI-NEXT: v_writelane_b32 v10, s6, 0 +; SI-NEXT: v_writelane_b32 v10, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32 ; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: v_mov_b32_e32 v4, s35 +; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB99_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: v_mov_b32_e32 v0, s35 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[34:35], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; SI-NEXT: v_mov_b32_e32 v0, s34 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v7, v3 -; SI-NEXT: v_mov_b32_e32 v6, v2 -; SI-NEXT: v_mov_b32_e32 v5, v1 -; SI-NEXT: v_mov_b32_e32 v4, v0 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 offset:32 glc +; SI-NEXT: v_mov_b32_e32 v9, v3 +; SI-NEXT: v_mov_b32_e32 v8, v2 +; SI-NEXT: v_mov_b32_e32 v7, v1 +; SI-NEXT: v_mov_b32_e32 v6, v0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: v_mov_b32_e32 v2, v4 -; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v2, v6 +; SI-NEXT: v_mov_b32_e32 v3, v7 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB99_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v8, 1 -; SI-NEXT: v_readlane_b32 s6, v8, 0 +; SI-NEXT: v_readlane_b32 s7, v10, 1 +; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -6060,31 +6052,29 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_offset_scalar(ptr addrspace ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v0, s34 -; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: v_mov_b32_e32 v4, s34 +; VI-NEXT: v_mov_b32_e32 v5, s35 +; VI-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v6, s7 +; VI-NEXT: v_mov_b32_e32 v7, s6 ; VI-NEXT: .LBB99_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] -; VI-NEXT: v_mov_b32_e32 v0, s7 -; VI-NEXT: v_mov_b32_e32 v6, s6 -; VI-NEXT: v_mov_b32_e32 v4, s34 -; VI-NEXT: v_mov_b32_e32 v5, s35 -; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: v_mov_b32_e32 v3, v1 -; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: v_mov_b32_e32 v2, v0 -; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB99_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[36:37] +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umax_i64_noret_offset_scalar: @@ -6093,14 +6083,14 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_offset_scalar(ptr addrspace ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] offset:32 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v5, s7 +; GFX9-NEXT: v_mov_b32_e32 v6, s6 ; GFX9-NEXT: .LBB99_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_mov_b32_e32 v5, s6 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -6123,45 +6113,45 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_scalar(ptr addrspace(1) inreg ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v6, s6, 0 -; SI-NEXT: v_writelane_b32 v6, s7, 1 +; SI-NEXT: v_writelane_b32 v10, s6, 0 +; SI-NEXT: v_writelane_b32 v10, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: v_mov_b32_e32 v4, s35 +; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB100_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v5, v1 -; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: v_mov_b32_e32 v9, v1 +; SI-NEXT: v_mov_b32_e32 v8, v0 +; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[34:35], v[8:9] +; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s35 -; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[34:35], v[4:5] -; SI-NEXT: v_cndmask_b32_e32 v3, v0, v5, vcc -; SI-NEXT: v_mov_b32_e32 v0, s34 -; SI-NEXT: v_cndmask_b32_e32 v2, v0, v4, vcc -; SI-NEXT: v_mov_b32_e32 v0, v2 -; SI-NEXT: v_mov_b32_e32 v1, v3 -; SI-NEXT: v_mov_b32_e32 v2, v4 -; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v0, v6 +; SI-NEXT: v_mov_b32_e32 v1, v7 +; SI-NEXT: v_mov_b32_e32 v2, v8 +; SI-NEXT: v_mov_b32_e32 v3, v9 ; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB100_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v6, 1 -; SI-NEXT: v_readlane_b32 s6, v6, 0 +; SI-NEXT: v_readlane_b32 s7, v10, 1 +; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -6172,23 +6162,23 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_scalar(ptr addrspace(1) inreg ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v4, s7 +; VI-NEXT: v_mov_b32_e32 v5, s6 +; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: .LBB100_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, v1 -; VI-NEXT: v_mov_b32_e32 v2, v0 -; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] -; VI-NEXT: v_mov_b32_e32 v0, s7 -; VI-NEXT: v_mov_b32_e32 v6, s6 -; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_mov_b32_e32 v5, s5 -; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; VI-NEXT: v_mov_b32_e32 v9, v1 +; VI-NEXT: v_mov_b32_e32 v8, v0 +; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9] +; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB100_1 @@ -6202,20 +6192,20 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_scalar(ptr addrspace(1) inreg ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v4, s6 ; GFX9-NEXT: .LBB100_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v6, v1 -; GFX9-NEXT: v_mov_b32_e32 v5, v0 -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[5:6] -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] glc +; GFX9-NEXT: v_mov_b32_e32 v8, v1 +; GFX9-NEXT: v_mov_b32_e32 v7, v0 +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[7:8] +; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v7, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[5:8], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB100_1 @@ -6231,45 +6221,45 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_offset_scalar(ptr addrspace(1) ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v6, s6, 0 -; SI-NEXT: v_writelane_b32 v6, s7, 1 +; SI-NEXT: v_writelane_b32 v10, s6, 0 +; SI-NEXT: v_writelane_b32 v10, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32 ; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: v_mov_b32_e32 v4, s35 +; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB101_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v5, v1 -; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: v_mov_b32_e32 v9, v1 +; SI-NEXT: v_mov_b32_e32 v8, v0 +; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[34:35], v[8:9] +; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s35 -; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[34:35], v[4:5] -; SI-NEXT: v_cndmask_b32_e32 v3, v0, v5, vcc -; SI-NEXT: v_mov_b32_e32 v0, s34 -; SI-NEXT: v_cndmask_b32_e32 v2, v0, v4, vcc -; SI-NEXT: v_mov_b32_e32 v0, v2 -; SI-NEXT: v_mov_b32_e32 v1, v3 -; SI-NEXT: v_mov_b32_e32 v2, v4 -; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v0, v6 +; SI-NEXT: v_mov_b32_e32 v1, v7 +; SI-NEXT: v_mov_b32_e32 v2, v8 +; SI-NEXT: v_mov_b32_e32 v3, v9 ; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB101_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v6, 1 -; SI-NEXT: v_readlane_b32 s6, v6, 0 +; SI-NEXT: v_readlane_b32 s7, v10, 1 +; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -6279,31 +6269,29 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v0, s34 -; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: v_mov_b32_e32 v2, s34 +; VI-NEXT: v_mov_b32_e32 v3, s35 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v4, s7 +; VI-NEXT: v_mov_b32_e32 v5, s6 ; VI-NEXT: .LBB101_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, v1 -; VI-NEXT: v_mov_b32_e32 v2, v0 -; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] -; VI-NEXT: v_mov_b32_e32 v0, s7 -; VI-NEXT: v_mov_b32_e32 v6, s6 -; VI-NEXT: v_mov_b32_e32 v4, s34 -; VI-NEXT: v_mov_b32_e32 v5, s35 -; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; VI-NEXT: v_mov_b32_e32 v9, v1 +; VI-NEXT: v_mov_b32_e32 v8, v0 +; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9] +; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB101_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[36:37] +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umax_i64_ret_offset_scalar: @@ -6312,20 +6300,20 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_offset_scalar(ptr addrspace(1) ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:32 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v4, s6 ; GFX9-NEXT: .LBB101_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v6, v1 -; GFX9-NEXT: v_mov_b32_e32 v5, v0 -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[5:6] -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v8, v1 +; GFX9-NEXT: v_mov_b32_e32 v7, v0 +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[7:8] +; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v7, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[5:8], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB101_1 @@ -6349,29 +6337,29 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out, ; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8 ; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: v_mov_b32_e32 v4, s3 +; SI-NEXT: v_mov_b32_e32 v5, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: v_mov_b32_e32 v3, s9 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: .LBB102_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: v_mov_b32_e32 v0, s3 ; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v7, v3 -; SI-NEXT: v_mov_b32_e32 v6, v2 -; SI-NEXT: v_mov_b32_e32 v5, v1 -; SI-NEXT: v_mov_b32_e32 v4, v0 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 offset:32 glc +; SI-NEXT: v_mov_b32_e32 v9, v3 +; SI-NEXT: v_mov_b32_e32 v8, v2 +; SI-NEXT: v_mov_b32_e32 v7, v1 +; SI-NEXT: v_mov_b32_e32 v6, v0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; SI-NEXT: v_mov_b32_e32 v2, v4 -; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v2, v6 +; SI-NEXT: v_mov_b32_e32 v3, v7 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] ; SI-NEXT: s_cbranch_execnz .LBB102_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6381,26 +6369,26 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out, ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; VI-NEXT: s_add_u32 s0, s0, s4 -; VI-NEXT: s_addc_u32 s1, s1, s5 +; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; VI-NEXT: s_add_u32 s0, s0, s6 +; VI-NEXT: s_addc_u32 s1, s1, s7 ; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20 ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v6, s3 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v7, s2 ; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: .LBB102_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] -; VI-NEXT: v_mov_b32_e32 v0, s3 -; VI-NEXT: v_mov_b32_e32 v6, s2 -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -6417,24 +6405,24 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out, ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX9-NEXT: s_add_u32 s0, s0, s4 ; GFX9-NEXT: s_addc_u32 s1, s1, s5 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s3 +; GFX9-NEXT: v_mov_b32_e32 v5, s2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: .LBB102_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: v_mov_b32_e32 v5, s2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] offset:32 glc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -6463,17 +6451,17 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o ; SI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: v_mov_b32_e32 v8, s5 +; SI-NEXT: v_mov_b32_e32 v9, s4 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v3, s7 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: .LBB103_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: v_mov_b32_e32 v0, s5 ; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; SI-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v9, v2, vcc ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v7, v3 ; SI-NEXT: v_mov_b32_e32 v6, v2 @@ -6500,68 +6488,68 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o ; VI-LABEL: atomic_umax_i64_ret_addr64_offset: ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 +; VI-NEXT: s_mov_b64 s[8:9], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; VI-NEXT: s_add_u32 s0, s0, s6 ; VI-NEXT: s_addc_u32 s1, s1, s7 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x20 +; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20 ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: s_mov_b64 s[6:7], 0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v4, s5 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v5, s4 +; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: .LBB103_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_mov_b32_e32 v3, v1 -; VI-NEXT: v_mov_b32_e32 v2, v0 -; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] -; VI-NEXT: v_mov_b32_e32 v0, s5 -; VI-NEXT: v_mov_b32_e32 v6, s4 -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; VI-NEXT: v_mov_b32_e32 v9, v3 +; VI-NEXT: v_mov_b32_e32 v8, v2 +; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9] +; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; VI-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; VI-NEXT: s_andn2_b64 exec, exec, s[6:7] +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; VI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; VI-NEXT: s_andn2_b64 exec, exec, s[8:9] ; VI-NEXT: s_cbranch_execnz .LBB103_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[6:7] -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_or_b64 exec, exec, s[8:9] +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_umax_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 ; GFX9-NEXT: s_add_u32 s0, s8, s0 ; GFX9-NEXT: s_addc_u32 s1, s9, s1 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x20 +; GFX9-NEXT: v_mov_b32_e32 v2, s13 +; GFX9-NEXT: v_mov_b32_e32 v3, s12 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: .LBB103_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_mov_b32_e32 v6, v1 -; GFX9-NEXT: v_mov_b32_e32 v5, v0 -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[5:6] -; GFX9-NEXT: v_mov_b32_e32 v0, s13 -; GFX9-NEXT: v_mov_b32_e32 v1, s12 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[0:1] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v8, v1 +; GFX9-NEXT: v_mov_b32_e32 v7, v0 +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[7:8] +; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[0:1] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB103_1 @@ -6589,17 +6577,17 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr ; SI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: v_mov_b32_e32 v8, s5 +; SI-NEXT: v_mov_b32_e32 v9, s4 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v3, s7 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: .LBB104_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: v_mov_b32_e32 v0, s5 ; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; SI-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v9, v2, vcc ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v7, v3 ; SI-NEXT: v_mov_b32_e32 v6, v2 @@ -6628,64 +6616,64 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr ; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; VI-NEXT: s_add_u32 s0, s0, s6 -; VI-NEXT: s_addc_u32 s1, s1, s7 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 -; VI-NEXT: s_mov_b64 s[6:7], 0 +; VI-NEXT: s_add_u32 s6, s0, s6 +; VI-NEXT: s_addc_u32 s7, s1, s7 +; VI-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: s_mov_b64 s[0:1], 0 +; VI-NEXT: v_mov_b32_e32 v4, s5 +; VI-NEXT: v_mov_b32_e32 v5, s4 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v3, s9 +; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: .LBB104_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_mov_b32_e32 v3, v1 -; VI-NEXT: v_mov_b32_e32 v2, v0 -; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] -; VI-NEXT: v_mov_b32_e32 v0, s5 -; VI-NEXT: v_mov_b32_e32 v6, s4 -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; VI-NEXT: v_mov_b32_e32 v9, v3 +; VI-NEXT: v_mov_b32_e32 v8, v2 +; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9] +; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; VI-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; VI-NEXT: s_andn2_b64 exec, exec, s[6:7] +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] ; VI-NEXT: s_cbranch_execnz .LBB104_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[6:7] -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_or_b64 exec, exec, s[0:1] +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_umax_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 ; GFX9-NEXT: s_add_u32 s0, s8, s0 ; GFX9-NEXT: s_addc_u32 s1, s9, s1 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v2, s13 +; GFX9-NEXT: v_mov_b32_e32 v3, s12 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: .LBB104_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_mov_b32_e32 v6, v1 -; GFX9-NEXT: v_mov_b32_e32 v5, v0 -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[5:6] -; GFX9-NEXT: v_mov_b32_e32 v0, s13 -; GFX9-NEXT: v_mov_b32_e32 v1, s12 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[0:1] glc +; GFX9-NEXT: v_mov_b32_e32 v8, v1 +; GFX9-NEXT: v_mov_b32_e32 v7, v0 +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[7:8] +; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB104_1 @@ -7258,45 +7246,45 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_scalar(ptr addrspace(1) inr ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v8, s6, 0 -; SI-NEXT: v_writelane_b32 v8, s7, 1 +; SI-NEXT: v_writelane_b32 v10, s6, 0 +; SI-NEXT: v_writelane_b32 v10, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: v_mov_b32_e32 v4, s35 +; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB111_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: v_mov_b32_e32 v0, s35 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_ge_u64_e32 vcc, s[34:35], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; SI-NEXT: v_mov_b32_e32 v0, s34 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v7, v3 -; SI-NEXT: v_mov_b32_e32 v6, v2 -; SI-NEXT: v_mov_b32_e32 v5, v1 -; SI-NEXT: v_mov_b32_e32 v4, v0 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 glc +; SI-NEXT: v_mov_b32_e32 v9, v3 +; SI-NEXT: v_mov_b32_e32 v8, v2 +; SI-NEXT: v_mov_b32_e32 v7, v1 +; SI-NEXT: v_mov_b32_e32 v6, v0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: v_mov_b32_e32 v2, v4 -; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v2, v6 +; SI-NEXT: v_mov_b32_e32 v3, v7 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB111_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v8, 1 -; SI-NEXT: v_readlane_b32 s6, v8, 0 +; SI-NEXT: v_readlane_b32 s7, v10, 1 +; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -7307,17 +7295,17 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_scalar(ptr addrspace(1) inr ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v6, s7 +; VI-NEXT: v_mov_b32_e32 v7, s6 +; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: .LBB111_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] -; VI-NEXT: v_mov_b32_e32 v0, s7 -; VI-NEXT: v_mov_b32_e32 v6, s6 -; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_mov_b32_e32 v5, s5 -; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -7337,14 +7325,14 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_scalar(ptr addrspace(1) inr ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v5, s7 +; GFX9-NEXT: v_mov_b32_e32 v6, s6 ; GFX9-NEXT: .LBB111_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_mov_b32_e32 v5, s6 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -7366,45 +7354,45 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_offset_scalar(ptr addrspace ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v8, s6, 0 -; SI-NEXT: v_writelane_b32 v8, s7, 1 +; SI-NEXT: v_writelane_b32 v10, s6, 0 +; SI-NEXT: v_writelane_b32 v10, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32 ; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: v_mov_b32_e32 v4, s35 +; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB112_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: v_mov_b32_e32 v0, s35 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_ge_u64_e32 vcc, s[34:35], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; SI-NEXT: v_mov_b32_e32 v0, s34 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v7, v3 -; SI-NEXT: v_mov_b32_e32 v6, v2 -; SI-NEXT: v_mov_b32_e32 v5, v1 -; SI-NEXT: v_mov_b32_e32 v4, v0 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 offset:32 glc +; SI-NEXT: v_mov_b32_e32 v9, v3 +; SI-NEXT: v_mov_b32_e32 v8, v2 +; SI-NEXT: v_mov_b32_e32 v7, v1 +; SI-NEXT: v_mov_b32_e32 v6, v0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: v_mov_b32_e32 v2, v4 -; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v2, v6 +; SI-NEXT: v_mov_b32_e32 v3, v7 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB112_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v8, 1 -; SI-NEXT: v_readlane_b32 s6, v8, 0 +; SI-NEXT: v_readlane_b32 s7, v10, 1 +; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -7414,31 +7402,29 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_offset_scalar(ptr addrspace ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v0, s34 -; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: v_mov_b32_e32 v4, s34 +; VI-NEXT: v_mov_b32_e32 v5, s35 +; VI-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v6, s7 +; VI-NEXT: v_mov_b32_e32 v7, s6 ; VI-NEXT: .LBB112_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] -; VI-NEXT: v_mov_b32_e32 v0, s7 -; VI-NEXT: v_mov_b32_e32 v6, s6 -; VI-NEXT: v_mov_b32_e32 v4, s34 -; VI-NEXT: v_mov_b32_e32 v5, s35 -; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: v_mov_b32_e32 v3, v1 -; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: v_mov_b32_e32 v2, v0 -; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB112_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[36:37] +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umin_i64_noret_offset_scalar: @@ -7447,14 +7433,14 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_offset_scalar(ptr addrspace ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] offset:32 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v5, s7 +; GFX9-NEXT: v_mov_b32_e32 v6, s6 ; GFX9-NEXT: .LBB112_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_mov_b32_e32 v5, s6 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -7477,45 +7463,45 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_scalar(ptr addrspace(1) inreg ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v6, s6, 0 -; SI-NEXT: v_writelane_b32 v6, s7, 1 +; SI-NEXT: v_writelane_b32 v10, s6, 0 +; SI-NEXT: v_writelane_b32 v10, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: v_mov_b32_e32 v4, s35 +; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB113_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v5, v1 -; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: v_mov_b32_e32 v9, v1 +; SI-NEXT: v_mov_b32_e32 v8, v0 +; SI-NEXT: v_cmp_ge_u64_e32 vcc, s[34:35], v[8:9] +; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s35 -; SI-NEXT: v_cmp_ge_u64_e32 vcc, s[34:35], v[4:5] -; SI-NEXT: v_cndmask_b32_e32 v3, v0, v5, vcc -; SI-NEXT: v_mov_b32_e32 v0, s34 -; SI-NEXT: v_cndmask_b32_e32 v2, v0, v4, vcc -; SI-NEXT: v_mov_b32_e32 v0, v2 -; SI-NEXT: v_mov_b32_e32 v1, v3 -; SI-NEXT: v_mov_b32_e32 v2, v4 -; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v0, v6 +; SI-NEXT: v_mov_b32_e32 v1, v7 +; SI-NEXT: v_mov_b32_e32 v2, v8 +; SI-NEXT: v_mov_b32_e32 v3, v9 ; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB113_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v6, 1 -; SI-NEXT: v_readlane_b32 s6, v6, 0 +; SI-NEXT: v_readlane_b32 s7, v10, 1 +; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -7526,23 +7512,23 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_scalar(ptr addrspace(1) inreg ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v4, s7 +; VI-NEXT: v_mov_b32_e32 v5, s6 +; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: .LBB113_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, v1 -; VI-NEXT: v_mov_b32_e32 v2, v0 -; VI-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] -; VI-NEXT: v_mov_b32_e32 v0, s7 -; VI-NEXT: v_mov_b32_e32 v6, s6 -; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_mov_b32_e32 v5, s5 -; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; VI-NEXT: v_mov_b32_e32 v9, v1 +; VI-NEXT: v_mov_b32_e32 v8, v0 +; VI-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9] +; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB113_1 @@ -7556,20 +7542,20 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_scalar(ptr addrspace(1) inreg ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v4, s6 ; GFX9-NEXT: .LBB113_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v6, v1 -; GFX9-NEXT: v_mov_b32_e32 v5, v0 -; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[5:6] -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] glc +; GFX9-NEXT: v_mov_b32_e32 v8, v1 +; GFX9-NEXT: v_mov_b32_e32 v7, v0 +; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[7:8] +; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v7, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[5:8], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB113_1 @@ -7585,45 +7571,45 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_offset_scalar(ptr addrspace(1) ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v6, s6, 0 -; SI-NEXT: v_writelane_b32 v6, s7, 1 +; SI-NEXT: v_writelane_b32 v10, s6, 0 +; SI-NEXT: v_writelane_b32 v10, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32 ; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: v_mov_b32_e32 v4, s35 +; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB114_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v5, v1 -; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: v_mov_b32_e32 v9, v1 +; SI-NEXT: v_mov_b32_e32 v8, v0 +; SI-NEXT: v_cmp_ge_u64_e32 vcc, s[34:35], v[8:9] +; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s35 -; SI-NEXT: v_cmp_ge_u64_e32 vcc, s[34:35], v[4:5] -; SI-NEXT: v_cndmask_b32_e32 v3, v0, v5, vcc -; SI-NEXT: v_mov_b32_e32 v0, s34 -; SI-NEXT: v_cndmask_b32_e32 v2, v0, v4, vcc -; SI-NEXT: v_mov_b32_e32 v0, v2 -; SI-NEXT: v_mov_b32_e32 v1, v3 -; SI-NEXT: v_mov_b32_e32 v2, v4 -; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v0, v6 +; SI-NEXT: v_mov_b32_e32 v1, v7 +; SI-NEXT: v_mov_b32_e32 v2, v8 +; SI-NEXT: v_mov_b32_e32 v3, v9 ; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB114_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v6, 1 -; SI-NEXT: v_readlane_b32 s6, v6, 0 +; SI-NEXT: v_readlane_b32 s7, v10, 1 +; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -7633,31 +7619,29 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v0, s34 -; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: v_mov_b32_e32 v2, s34 +; VI-NEXT: v_mov_b32_e32 v3, s35 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v4, s7 +; VI-NEXT: v_mov_b32_e32 v5, s6 ; VI-NEXT: .LBB114_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, v1 -; VI-NEXT: v_mov_b32_e32 v2, v0 -; VI-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] -; VI-NEXT: v_mov_b32_e32 v0, s7 -; VI-NEXT: v_mov_b32_e32 v6, s6 -; VI-NEXT: v_mov_b32_e32 v4, s34 -; VI-NEXT: v_mov_b32_e32 v5, s35 -; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; VI-NEXT: v_mov_b32_e32 v9, v1 +; VI-NEXT: v_mov_b32_e32 v8, v0 +; VI-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9] +; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB114_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[36:37] +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umin_i64_ret_offset_scalar: @@ -7666,20 +7650,20 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_offset_scalar(ptr addrspace(1) ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:32 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v4, s6 ; GFX9-NEXT: .LBB114_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v6, v1 -; GFX9-NEXT: v_mov_b32_e32 v5, v0 -; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[5:6] -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v8, v1 +; GFX9-NEXT: v_mov_b32_e32 v7, v0 +; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[7:8] +; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v7, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[5:8], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB114_1 @@ -8248,45 +8232,45 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_scalar(ptr addrspace(1) inre ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v8, s6, 0 -; SI-NEXT: v_writelane_b32 v8, s7, 1 +; SI-NEXT: v_writelane_b32 v10, s6, 0 +; SI-NEXT: v_writelane_b32 v10, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: v_mov_b32_e32 v4, s35 +; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB121_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: v_mov_b32_e32 v0, s35 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[34:35], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; SI-NEXT: v_mov_b32_e32 v0, s34 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v7, v3 -; SI-NEXT: v_mov_b32_e32 v6, v2 -; SI-NEXT: v_mov_b32_e32 v5, v1 -; SI-NEXT: v_mov_b32_e32 v4, v0 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 glc +; SI-NEXT: v_mov_b32_e32 v9, v3 +; SI-NEXT: v_mov_b32_e32 v8, v2 +; SI-NEXT: v_mov_b32_e32 v7, v1 +; SI-NEXT: v_mov_b32_e32 v6, v0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: v_mov_b32_e32 v2, v4 -; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v2, v6 +; SI-NEXT: v_mov_b32_e32 v3, v7 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB121_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v8, 1 -; SI-NEXT: v_readlane_b32 s6, v8, 0 +; SI-NEXT: v_readlane_b32 s7, v10, 1 +; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -8297,17 +8281,17 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_scalar(ptr addrspace(1) inre ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v6, s7 +; VI-NEXT: v_mov_b32_e32 v7, s6 +; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: .LBB121_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] -; VI-NEXT: v_mov_b32_e32 v0, s7 -; VI-NEXT: v_mov_b32_e32 v6, s6 -; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_mov_b32_e32 v5, s5 -; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -8327,14 +8311,14 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_scalar(ptr addrspace(1) inre ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v5, s7 +; GFX9-NEXT: v_mov_b32_e32 v6, s6 ; GFX9-NEXT: .LBB121_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_mov_b32_e32 v5, s6 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -8356,45 +8340,45 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_offset_scalar(ptr addrspace( ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v8, s6, 0 -; SI-NEXT: v_writelane_b32 v8, s7, 1 +; SI-NEXT: v_writelane_b32 v10, s6, 0 +; SI-NEXT: v_writelane_b32 v10, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32 ; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: v_mov_b32_e32 v4, s35 +; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB122_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: v_mov_b32_e32 v0, s35 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[34:35], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; SI-NEXT: v_mov_b32_e32 v0, s34 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v7, v3 -; SI-NEXT: v_mov_b32_e32 v6, v2 -; SI-NEXT: v_mov_b32_e32 v5, v1 -; SI-NEXT: v_mov_b32_e32 v4, v0 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 offset:32 glc +; SI-NEXT: v_mov_b32_e32 v9, v3 +; SI-NEXT: v_mov_b32_e32 v8, v2 +; SI-NEXT: v_mov_b32_e32 v7, v1 +; SI-NEXT: v_mov_b32_e32 v6, v0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: v_mov_b32_e32 v2, v4 -; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v2, v6 +; SI-NEXT: v_mov_b32_e32 v3, v7 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB122_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v8, 1 -; SI-NEXT: v_readlane_b32 s6, v8, 0 +; SI-NEXT: v_readlane_b32 s7, v10, 1 +; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -8404,31 +8388,29 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_offset_scalar(ptr addrspace( ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v0, s34 -; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: v_mov_b32_e32 v4, s34 +; VI-NEXT: v_mov_b32_e32 v5, s35 +; VI-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v6, s7 +; VI-NEXT: v_mov_b32_e32 v7, s6 ; VI-NEXT: .LBB122_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] -; VI-NEXT: v_mov_b32_e32 v0, s7 -; VI-NEXT: v_mov_b32_e32 v6, s6 -; VI-NEXT: v_mov_b32_e32 v4, s34 -; VI-NEXT: v_mov_b32_e32 v5, s35 -; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: v_mov_b32_e32 v3, v1 -; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: v_mov_b32_e32 v2, v0 -; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB122_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[36:37] +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_min_i64_noret_offset_scalar: @@ -8437,14 +8419,14 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_offset_scalar(ptr addrspace( ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] offset:32 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v5, s7 +; GFX9-NEXT: v_mov_b32_e32 v6, s6 ; GFX9-NEXT: .LBB122_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_mov_b32_e32 v5, s6 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -8467,45 +8449,45 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_scalar(ptr addrspace(1) inreg % ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v6, s6, 0 -; SI-NEXT: v_writelane_b32 v6, s7, 1 +; SI-NEXT: v_writelane_b32 v10, s6, 0 +; SI-NEXT: v_writelane_b32 v10, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: v_mov_b32_e32 v4, s35 +; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB123_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v5, v1 -; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: v_mov_b32_e32 v9, v1 +; SI-NEXT: v_mov_b32_e32 v8, v0 +; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[34:35], v[8:9] +; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s35 -; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[34:35], v[4:5] -; SI-NEXT: v_cndmask_b32_e32 v3, v0, v5, vcc -; SI-NEXT: v_mov_b32_e32 v0, s34 -; SI-NEXT: v_cndmask_b32_e32 v2, v0, v4, vcc -; SI-NEXT: v_mov_b32_e32 v0, v2 -; SI-NEXT: v_mov_b32_e32 v1, v3 -; SI-NEXT: v_mov_b32_e32 v2, v4 -; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v0, v6 +; SI-NEXT: v_mov_b32_e32 v1, v7 +; SI-NEXT: v_mov_b32_e32 v2, v8 +; SI-NEXT: v_mov_b32_e32 v3, v9 ; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB123_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v6, 1 -; SI-NEXT: v_readlane_b32 s6, v6, 0 +; SI-NEXT: v_readlane_b32 s7, v10, 1 +; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -8516,23 +8498,23 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_scalar(ptr addrspace(1) inreg % ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v4, s7 +; VI-NEXT: v_mov_b32_e32 v5, s6 +; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: .LBB123_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, v1 -; VI-NEXT: v_mov_b32_e32 v2, v0 -; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] -; VI-NEXT: v_mov_b32_e32 v0, s7 -; VI-NEXT: v_mov_b32_e32 v6, s6 -; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_mov_b32_e32 v5, s5 -; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; VI-NEXT: v_mov_b32_e32 v9, v1 +; VI-NEXT: v_mov_b32_e32 v8, v0 +; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9] +; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB123_1 @@ -8546,20 +8528,20 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_scalar(ptr addrspace(1) inreg % ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v4, s6 ; GFX9-NEXT: .LBB123_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v6, v1 -; GFX9-NEXT: v_mov_b32_e32 v5, v0 -; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[5:6] -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] glc +; GFX9-NEXT: v_mov_b32_e32 v8, v1 +; GFX9-NEXT: v_mov_b32_e32 v7, v0 +; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[7:8] +; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v7, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[5:8], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB123_1 @@ -8575,45 +8557,45 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_offset_scalar(ptr addrspace(1) ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v6, s6, 0 -; SI-NEXT: v_writelane_b32 v6, s7, 1 +; SI-NEXT: v_writelane_b32 v10, s6, 0 +; SI-NEXT: v_writelane_b32 v10, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32 ; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: v_mov_b32_e32 v4, s35 +; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB124_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v5, v1 -; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: v_mov_b32_e32 v9, v1 +; SI-NEXT: v_mov_b32_e32 v8, v0 +; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[34:35], v[8:9] +; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s35 -; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[34:35], v[4:5] -; SI-NEXT: v_cndmask_b32_e32 v3, v0, v5, vcc -; SI-NEXT: v_mov_b32_e32 v0, s34 -; SI-NEXT: v_cndmask_b32_e32 v2, v0, v4, vcc -; SI-NEXT: v_mov_b32_e32 v0, v2 -; SI-NEXT: v_mov_b32_e32 v1, v3 -; SI-NEXT: v_mov_b32_e32 v2, v4 -; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v0, v6 +; SI-NEXT: v_mov_b32_e32 v1, v7 +; SI-NEXT: v_mov_b32_e32 v2, v8 +; SI-NEXT: v_mov_b32_e32 v3, v9 ; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB124_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v6, 1 -; SI-NEXT: v_readlane_b32 s6, v6, 0 +; SI-NEXT: v_readlane_b32 s7, v10, 1 +; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -8623,31 +8605,29 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v0, s34 -; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: v_mov_b32_e32 v2, s34 +; VI-NEXT: v_mov_b32_e32 v3, s35 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v4, s7 +; VI-NEXT: v_mov_b32_e32 v5, s6 ; VI-NEXT: .LBB124_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, v1 -; VI-NEXT: v_mov_b32_e32 v2, v0 -; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] -; VI-NEXT: v_mov_b32_e32 v0, s7 -; VI-NEXT: v_mov_b32_e32 v6, s6 -; VI-NEXT: v_mov_b32_e32 v4, s34 -; VI-NEXT: v_mov_b32_e32 v5, s35 -; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; VI-NEXT: v_mov_b32_e32 v9, v1 +; VI-NEXT: v_mov_b32_e32 v8, v0 +; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9] +; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB124_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[36:37] +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_min_i64_ret_offset_scalar: @@ -8656,20 +8636,20 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_offset_scalar(ptr addrspace(1) ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:32 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v4, s6 ; GFX9-NEXT: .LBB124_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v6, v1 -; GFX9-NEXT: v_mov_b32_e32 v5, v0 -; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[5:6] -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v8, v1 +; GFX9-NEXT: v_mov_b32_e32 v7, v0 +; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[7:8] +; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v7, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[5:8], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB124_1 @@ -8693,29 +8673,29 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i ; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8 ; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: v_mov_b32_e32 v4, s3 +; SI-NEXT: v_mov_b32_e32 v5, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: v_mov_b32_e32 v3, s9 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: .LBB125_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: v_mov_b32_e32 v0, s3 ; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v7, v3 -; SI-NEXT: v_mov_b32_e32 v6, v2 -; SI-NEXT: v_mov_b32_e32 v5, v1 -; SI-NEXT: v_mov_b32_e32 v4, v0 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 offset:32 glc +; SI-NEXT: v_mov_b32_e32 v9, v3 +; SI-NEXT: v_mov_b32_e32 v8, v2 +; SI-NEXT: v_mov_b32_e32 v7, v1 +; SI-NEXT: v_mov_b32_e32 v6, v0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; SI-NEXT: v_mov_b32_e32 v2, v4 -; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v2, v6 +; SI-NEXT: v_mov_b32_e32 v3, v7 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] ; SI-NEXT: s_cbranch_execnz .LBB125_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8725,26 +8705,26 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; VI-NEXT: s_add_u32 s0, s0, s4 -; VI-NEXT: s_addc_u32 s1, s1, s5 +; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; VI-NEXT: s_add_u32 s0, s0, s6 +; VI-NEXT: s_addc_u32 s1, s1, s7 ; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20 ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v6, s3 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v7, s2 ; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: .LBB125_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] -; VI-NEXT: v_mov_b32_e32 v0, s3 -; VI-NEXT: v_mov_b32_e32 v6, s2 -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -8761,24 +8741,24 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX9-NEXT: s_add_u32 s0, s0, s4 ; GFX9-NEXT: s_addc_u32 s1, s1, s5 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s3 +; GFX9-NEXT: v_mov_b32_e32 v5, s2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: .LBB125_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: v_mov_b32_e32 v5, s2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] offset:32 glc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -8807,17 +8787,17 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou ; SI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: v_mov_b32_e32 v8, s5 +; SI-NEXT: v_mov_b32_e32 v9, s4 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v3, s7 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: .LBB126_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: v_mov_b32_e32 v0, s5 ; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; SI-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v9, v2, vcc ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v7, v3 ; SI-NEXT: v_mov_b32_e32 v6, v2 @@ -8844,68 +8824,68 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou ; VI-LABEL: atomic_min_i64_ret_addr64_offset: ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 +; VI-NEXT: s_mov_b64 s[8:9], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; VI-NEXT: s_add_u32 s0, s0, s6 ; VI-NEXT: s_addc_u32 s1, s1, s7 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x20 +; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20 ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: s_mov_b64 s[6:7], 0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v4, s5 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v5, s4 +; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: .LBB126_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_mov_b32_e32 v3, v1 -; VI-NEXT: v_mov_b32_e32 v2, v0 -; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[2:3] -; VI-NEXT: v_mov_b32_e32 v0, s5 -; VI-NEXT: v_mov_b32_e32 v6, s4 -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; VI-NEXT: v_mov_b32_e32 v9, v3 +; VI-NEXT: v_mov_b32_e32 v8, v2 +; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9] +; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; VI-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; VI-NEXT: s_andn2_b64 exec, exec, s[6:7] +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; VI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; VI-NEXT: s_andn2_b64 exec, exec, s[8:9] ; VI-NEXT: s_cbranch_execnz .LBB126_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[6:7] -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_or_b64 exec, exec, s[8:9] +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_min_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 ; GFX9-NEXT: s_add_u32 s0, s8, s0 ; GFX9-NEXT: s_addc_u32 s1, s9, s1 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x20 +; GFX9-NEXT: v_mov_b32_e32 v2, s13 +; GFX9-NEXT: v_mov_b32_e32 v3, s12 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: .LBB126_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_mov_b32_e32 v6, v1 -; GFX9-NEXT: v_mov_b32_e32 v5, v0 -; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[5:6] -; GFX9-NEXT: v_mov_b32_e32 v0, s13 -; GFX9-NEXT: v_mov_b32_e32 v1, s12 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[0:1] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v8, v1 +; GFX9-NEXT: v_mov_b32_e32 v7, v0 +; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[7:8] +; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[0:1] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB126_1 @@ -8927,35 +8907,35 @@ define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) { ; SI: ; %bb.0: ; %entry ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 -; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: v_mov_b32_e32 v4, s3 +; SI-NEXT: v_mov_b32_e32 v5, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s8 -; SI-NEXT: v_mov_b32_e32 v3, s9 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: v_mov_b32_e32 v3, s5 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: .LBB127_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: v_mov_b32_e32 v0, s7 -; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] +; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v7, v3 -; SI-NEXT: v_mov_b32_e32 v6, v2 -; SI-NEXT: v_mov_b32_e32 v5, v1 -; SI-NEXT: v_mov_b32_e32 v4, v0 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[0:3], 0 glc +; SI-NEXT: v_mov_b32_e32 v9, v3 +; SI-NEXT: v_mov_b32_e32 v8, v2 +; SI-NEXT: v_mov_b32_e32 v7, v1 +; SI-NEXT: v_mov_b32_e32 v6, v0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] -; SI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; SI-NEXT: v_mov_b32_e32 v2, v4 -; SI-NEXT: v_mov_b32_e32 v3, v5 -; SI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v2, v6 +; SI-NEXT: v_mov_b32_e32 v3, v7 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] ; SI-NEXT: s_cbranch_execnz .LBB127_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_endpgm @@ -8966,18 +8946,18 @@ define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) { ; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v6, s3 +; VI-NEXT: v_mov_b32_e32 v7, s2 +; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 ; VI-NEXT: .LBB127_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] -; VI-NEXT: v_mov_b32_e32 v0, s3 -; VI-NEXT: v_mov_b32_e32 v6, s2 -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -8994,20 +8974,20 @@ define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) { ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v4, s3 +; GFX9-NEXT: v_mov_b32_e32 v5, s2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: .LBB127_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: v_mov_b32_e32 v5, s2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -9034,17 +9014,17 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr ; SI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: v_mov_b32_e32 v8, s5 +; SI-NEXT: v_mov_b32_e32 v9, s4 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v3, s7 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: .LBB128_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: v_mov_b32_e32 v0, s5 ; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; SI-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v9, v2, vcc ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v7, v3 ; SI-NEXT: v_mov_b32_e32 v6, v2 @@ -9073,64 +9053,64 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr ; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; VI-NEXT: s_add_u32 s0, s0, s6 -; VI-NEXT: s_addc_u32 s1, s1, s7 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 -; VI-NEXT: s_mov_b64 s[6:7], 0 +; VI-NEXT: s_add_u32 s6, s0, s6 +; VI-NEXT: s_addc_u32 s7, s1, s7 +; VI-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: s_mov_b64 s[0:1], 0 +; VI-NEXT: v_mov_b32_e32 v4, s5 +; VI-NEXT: v_mov_b32_e32 v5, s4 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v3, s9 +; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: .LBB128_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_mov_b32_e32 v3, v1 -; VI-NEXT: v_mov_b32_e32 v2, v0 -; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[2:3] -; VI-NEXT: v_mov_b32_e32 v0, s5 -; VI-NEXT: v_mov_b32_e32 v6, s4 -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc -; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; VI-NEXT: v_mov_b32_e32 v9, v3 +; VI-NEXT: v_mov_b32_e32 v8, v2 +; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9] +; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; VI-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; VI-NEXT: s_andn2_b64 exec, exec, s[6:7] +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] ; VI-NEXT: s_cbranch_execnz .LBB128_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[6:7] -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_or_b64 exec, exec, s[0:1] +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_min_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 ; GFX9-NEXT: s_add_u32 s0, s8, s0 ; GFX9-NEXT: s_addc_u32 s1, s9, s1 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v2, s13 +; GFX9-NEXT: v_mov_b32_e32 v3, s12 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: .LBB128_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_mov_b32_e32 v6, v1 -; GFX9-NEXT: v_mov_b32_e32 v5, v0 -; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[5:6] -; GFX9-NEXT: v_mov_b32_e32 v0, s13 -; GFX9-NEXT: v_mov_b32_e32 v1, s12 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[0:1] glc +; GFX9-NEXT: v_mov_b32_e32 v8, v1 +; GFX9-NEXT: v_mov_b32_e32 v7, v0 +; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[7:8] +; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB128_1 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll index 785114cdbf39a..7792422291998 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll @@ -328,12 +328,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7LESS-NEXT: .LBB1_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v2 -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX7LESS-NEXT: v_mul_f32_e32 v3, 1.0, v1 -; GFX7LESS-NEXT: v_max_f32_e32 v0, v3, v0 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GFX7LESS-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 ; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc @@ -399,14 +400,14 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v3, s[0:1] ; GFX9-NEXT: .LBB1_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_max_f32_e32 v0, v2, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f32_e32 v4, v1, v1 -; GFX9-NEXT: v_max_f32_e32 v0, v4, v0 +; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v2 ; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -757,14 +758,14 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: v_max_f32_e64 v6, s4, s4 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1] ; GFX9-DPP-NEXT: .LBB1_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_max_f32_e64 v0, s4, s4 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_max_f32_e32 v6, v1, v1 -; GFX9-DPP-NEXT: v_max_f32_e32 v0, v6, v0 +; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX9-DPP-NEXT: v_max_f32_e32 v0, v0, v6 ; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -1366,12 +1367,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7LESS-NEXT: .LBB3_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v2 -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX7LESS-NEXT: v_mul_f32_e32 v3, 1.0, v1 -; GFX7LESS-NEXT: v_max_f32_e32 v0, v3, v0 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GFX7LESS-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 ; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc @@ -1437,14 +1439,14 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v3, s[0:1] ; GFX9-NEXT: .LBB3_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_max_f32_e32 v0, v2, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f32_e32 v4, v1, v1 -; GFX9-NEXT: v_max_f32_e32 v0, v4, v0 +; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v2 ; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -1795,14 +1797,14 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: v_max_f32_e64 v6, s4, s4 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1] ; GFX9-DPP-NEXT: .LBB3_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_max_f32_e64 v0, s4, s4 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_max_f32_e32 v6, v1, v1 -; GFX9-DPP-NEXT: v_max_f32_e32 v0, v6, v0 +; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX9-DPP-NEXT: v_max_f32_e32 v0, v0, v6 ; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -2404,12 +2406,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7LESS-NEXT: .LBB5_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v2 -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX7LESS-NEXT: v_mul_f32_e32 v3, 1.0, v1 -; GFX7LESS-NEXT: v_max_f32_e32 v0, v3, v0 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GFX7LESS-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 ; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc @@ -2475,14 +2478,14 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v3, s[0:1] ; GFX9-NEXT: .LBB5_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_max_f32_e32 v0, v2, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f32_e32 v4, v1, v1 -; GFX9-NEXT: v_max_f32_e32 v0, v4, v0 +; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v2 ; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -2833,14 +2836,14 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: v_max_f32_e64 v6, s4, s4 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1] ; GFX9-DPP-NEXT: .LBB5_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_max_f32_e64 v0, s4, s4 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_max_f32_e32 v6, v1, v1 -; GFX9-DPP-NEXT: v_max_f32_e32 v0, v6, v0 +; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX9-DPP-NEXT: v_max_f32_e32 v0, v0, v6 ; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -4051,12 +4054,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec -; GFX7LESS-NEXT: v_mov_b32_e32 v41, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v42, 0x7ff80000 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0x7ff80000 ; GFX7LESS-NEXT: .LBB7_1: ; %ComputeLoop ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1] -; GFX7LESS-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42] +; GFX7LESS-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4 ; GFX7LESS-NEXT: v_readlane_b32 s2, v0, s4 ; GFX7LESS-NEXT: s_lshl_b64 s[4:5], 1, s4 @@ -4064,7 +4067,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] ; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 ; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3] -; GFX7LESS-NEXT: v_max_f64 v[41:42], v[2:3], v[4:5] +; GFX7LESS-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX7LESS-NEXT: s_cbranch_vccnz .LBB7_1 ; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 @@ -4080,16 +4083,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 ; GFX7LESS-NEXT: s_mov_b64 s[46:47], 0 +; GFX7LESS-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX7LESS-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42] ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX7LESS-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] ; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 ; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_max_f64 v[0:1], v[4:5], v[2:3] +; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[41:42] ; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 @@ -4163,20 +4166,20 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX9-NEXT: s_movk_i32 s32, 0x800 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: v_mov_b32_e32 v41, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_mov_b64 s[0:1], exec -; GFX9-NEXT: v_mov_b32_e32 v42, 0x7ff80000 +; GFX9-NEXT: v_mov_b32_e32 v3, 0x7ff80000 ; GFX9-NEXT: .LBB7_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1] ; GFX9-NEXT: v_readlane_b32 s3, v1, s4 ; GFX9-NEXT: v_readlane_b32 s2, v0, s4 -; GFX9-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42] +; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: v_max_f64 v[41:42], v[2:3], v[4:5] +; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX9-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -4188,14 +4191,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX9-NEXT: s_mov_b64 s[46:47], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX9-NEXT: global_load_dwordx2 v[4:5], v0, s[44:45] ; GFX9-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] +; GFX9-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX9-NEXT: s_add_u32 s8, s36, 44 ; GFX9-NEXT: s_addc_u32 s9, s37, 0 ; GFX9-NEXT: s_getpc_b64 s[0:1] @@ -4203,19 +4206,19 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX9-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 ; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] -; GFX9-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] +; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] ; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] ; GFX9-NEXT: s_mov_b32 s12, s43 ; GFX9-NEXT: s_mov_b32 s13, s42 ; GFX9-NEXT: s_mov_b32 s14, s33 ; GFX9-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 ; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 ; GFX9-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, s44 @@ -4226,8 +4229,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_load_dword v4, off, s[48:51], 0 +; GFX9-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: s_or_b64 s[46:47], vcc, s[46:47] @@ -4272,20 +4275,20 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1064-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-NEXT: v_mov_b32_e32 v41, 0 -; GFX1064-NEXT: v_mov_b32_e32 v42, 0x7ff80000 +; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: v_mov_b32_e32 v3, 0x7ff80000 ; GFX1064-NEXT: s_mov_b64 s[0:1], exec ; GFX1064-NEXT: .LBB7_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1] -; GFX1064-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42] +; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX1064-NEXT: v_readlane_b32 s3, v1, s4 ; GFX1064-NEXT: v_readlane_b32 s2, v0, s4 ; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] ; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1064-NEXT: v_max_f64 v[41:42], v[2:3], v[4:5] +; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX1064-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -4297,26 +4300,28 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX1064-NEXT: s_mov_b64 s[46:47], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX1064-NEXT: global_load_dwordx2 v[4:5], v0, s[44:45] ; GFX1064-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] +; GFX1064-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX1064-NEXT: s_add_u32 s8, s34, 44 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0 ; GFX1064-NEXT: s_getpc_b64 s[0:1] ; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 ; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 -; GFX1064-NEXT: v_mov_b32_e32 v0, 8 -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: v_mov_b32_e32 v2, s44 +; GFX1064-NEXT: v_mov_b32_e32 v3, s45 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: v_mov_b32_e32 v7, 0 ; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41] @@ -4326,18 +4331,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1064-NEXT: s_mov_b32 s13, s42 ; GFX1064-NEXT: s_mov_b32 s14, s33 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] -; GFX1064-NEXT: v_mov_b32_e32 v5, 8 -; GFX1064-NEXT: v_mov_b32_e32 v6, 0 -; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-NEXT: v_mov_b32_e32 v3, s45 -; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_load_dword v4, off, s[48:51], 0 +; GFX1064-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4 ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-NEXT: s_or_b64 s[46:47], vcc, s[46:47] @@ -4382,20 +4385,20 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1032-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-NEXT: v_mov_b32_e32 v41, 0 -; GFX1032-NEXT: v_mov_b32_e32 v42, 0x7ff80000 +; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: v_mov_b32_e32 v3, 0x7ff80000 ; GFX1032-NEXT: s_mov_b32 s0, exec_lo ; GFX1032-NEXT: .LBB7_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 -; GFX1032-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42] +; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX1032-NEXT: v_readlane_b32 s3, v1, s1 ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 -; GFX1032-NEXT: v_max_f64 v[41:42], v[2:3], v[4:5] +; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX1032-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -4407,25 +4410,27 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX1032-NEXT: global_load_dwordx2 v[4:5], v0, s[44:45] ; GFX1032-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] +; GFX1032-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX1032-NEXT: s_add_u32 s8, s34, 44 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0 ; GFX1032-NEXT: s_getpc_b64 s[0:1] ; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 ; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 -; GFX1032-NEXT: v_mov_b32_e32 v0, 8 -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: v_mov_b32_e32 v2, s44 +; GFX1032-NEXT: v_mov_b32_e32 v3, s45 +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: v_mov_b32_e32 v7, 0 ; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41] @@ -4435,18 +4440,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1032-NEXT: s_mov_b32 s13, s42 ; GFX1032-NEXT: s_mov_b32 s14, s33 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] -; GFX1032-NEXT: v_mov_b32_e32 v5, 8 -; GFX1032-NEXT: v_mov_b32_e32 v6, 0 -; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-NEXT: v_mov_b32_e32 v3, s45 -; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX1032-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_load_dword v4, off, s[48:51], 0 +; GFX1032-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4 ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-NEXT: s_or_b32 s46, vcc_lo, s46 @@ -4481,14 +4484,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-NEXT: v_mov_b32_e32 v41, 0 -; GFX1164-NEXT: v_mov_b32_e32 v42, 0x7ff80000 +; GFX1164-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-NEXT: v_mov_b32_e32 v3, 0x7ff80000 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: .LBB7_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1] -; GFX1164-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42] +; GFX1164-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX1164-NEXT: v_readlane_b32 s3, v1, s4 ; GFX1164-NEXT: v_readlane_b32 s2, v0, s4 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) @@ -4497,7 +4500,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164-NEXT: v_max_f64 v[41:42], v[2:3], v[4:5] +; GFX1164-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX1164-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -4510,16 +4513,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX1164-NEXT: s_mov_b64 s[46:47], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: global_load_b64 v[1:2], v0, s[44:45] +; GFX1164-NEXT: global_load_b64 v[4:5], v0, s[44:45] ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-NEXT: .p2align 6 ; GFX1164-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] +; GFX1164-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX1164-NEXT: s_add_u32 s8, s34, 44 ; GFX1164-NEXT: s_addc_u32 s9, s35, 0 ; GFX1164-NEXT: s_getpc_b64 s[0:1] @@ -4527,7 +4530,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-NEXT: v_mov_b32_e32 v2, s44 +; GFX1164-NEXT: v_mov_b32_e32 v3, s45 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: v_mov_b32_e32 v7, 0 ; GFX1164-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1164-NEXT: s_mov_b64 s[6:7], s[38:39] @@ -4535,19 +4540,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-NEXT: s_mov_b32 s12, s43 ; GFX1164-NEXT: s_mov_b32 s13, s42 ; GFX1164-NEXT: s_mov_b32 s14, s33 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX1164-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] -; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s44 -; GFX1164-NEXT: v_mov_b32_e32 v5, 8 -; GFX1164-NEXT: v_mov_b32_e32 v6, 0 -; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-NEXT: v_mov_b32_e32 v3, s45 +; GFX1164-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX1164-NEXT: scratch_store_b64 off, v[4:5], off ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-NEXT: scratch_store_b64 off, v[0:1], off offset:8 +; GFX1164-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1164-NEXT: scratch_load_b64 v[4:5], off, off ; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 @@ -4584,14 +4586,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1132-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-NEXT: v_mov_b32_e32 v41, 0 -; GFX1132-NEXT: v_mov_b32_e32 v42, 0x7ff80000 +; GFX1132-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-NEXT: v_mov_b32_e32 v3, 0x7ff80000 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: .LBB7_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 -; GFX1132-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42] +; GFX1132-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 @@ -4600,7 +4602,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_max_f64 v[41:42], v[2:3], v[4:5] +; GFX1132-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX1132-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -4613,22 +4615,24 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1132-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: global_load_b64 v[1:2], v0, s[44:45] +; GFX1132-NEXT: global_load_b64 v[4:5], v0, s[44:45] ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-NEXT: .p2align 6 ; GFX1132-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] +; GFX1132-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX1132-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-NEXT: s_getpc_b64 s[0:1] ; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 +; GFX1132-NEXT: v_mov_b32_e32 v31, v40 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-NEXT: v_mov_b32_e32 v3, s45 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0 ; GFX1132-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1132-NEXT: s_mov_b64 s[6:7], s[38:39] @@ -4636,16 +4640,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1132-NEXT: s_mov_b32 s12, s43 ; GFX1132-NEXT: s_mov_b32 s13, s42 ; GFX1132-NEXT: s_mov_b32 s14, s33 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX1132-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] -; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 -; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 -; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 +; GFX1132-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132-NEXT: v_mov_b32_e32 v2, s44 +; GFX1132-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX1132-NEXT: scratch_store_b64 off, v[4:5], off +; GFX1132-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 8 +; GFX1132-NEXT: scratch_store_b64 off, v[0:1], off offset:8 +; GFX1132-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1132-NEXT: scratch_load_b64 v[4:5], off, off ; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 @@ -4684,32 +4688,29 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX7LESS-DPP-NEXT: v_or_b32_e32 v42, v0, v2 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v40, v0, v2 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v40, v0 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v41, v1 -; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[44:47], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX7LESS-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] ; GFX7LESS-DPP-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-DPP-NEXT: v_max_f64 v[2:3], v[40:41], v[40:41] ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[4:5], v[2:3] +; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] ; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 @@ -4730,17 +4731,17 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s44 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7LESS-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7LESS-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB7_1 @@ -5093,23 +5094,23 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] ; GFX1032-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v8 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v42, v9 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v9 ; GFX1032-DPP-NEXT: s_mov_b32 s46, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 +; GFX1032-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] ; GFX1032-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] +; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] @@ -5122,6 +5123,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] @@ -5131,9 +5134,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[41:42] ; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s45 @@ -5228,9 +5229,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, v8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v8 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v42, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v9 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -5239,6 +5240,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 +; GFX1164-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] ; GFX1164-DPP-NEXT: s_mov_b64 s[46:47], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45] @@ -5246,9 +5248,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: .p2align 6 ; GFX1164-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] +; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] @@ -5257,6 +5258,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] @@ -5264,13 +5267,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_mov_b32 s12, s43 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] +; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[41:42] ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s44 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 @@ -5353,9 +5353,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v41, v8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v8 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v42, v9 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v9 ; GFX1132-DPP-NEXT: s_mov_b32 s46, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -5363,15 +5363,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 +; GFX1132-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45] ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-DPP-NEXT: .p2align 6 ; GFX1132-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] +; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] @@ -5379,6 +5379,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] @@ -5386,11 +5387,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_mov_b32 s12, s43 ; GFX1132-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[41:42] ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -5807,12 +5807,13 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX7LESS-NEXT: .LBB9_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX7LESS-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] -; GFX7LESS-NEXT: v_max_f64 v[0:1], v[6:7], v[0:1] +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX7LESS-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3 ; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2 ; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1 @@ -5882,15 +5883,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] ; GFX9-NEXT: .LBB9_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f64 v[7:8], v[2:3], v[2:3] -; GFX9-NEXT: v_max_f64 v[0:1], v[7:8], v[0:1] +; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -6067,16 +6068,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1164-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1] ; GFX1164-NEXT: .LBB9_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_max_f64 v[7:8], v[2:3], v[2:3] +; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_max_f64 v[0:1], v[7:8], v[0:1] +; GFX1164-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] ; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -6137,15 +6138,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1132-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1] ; GFX1132-NEXT: .LBB9_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_max_f64 v[7:8], v[2:3], v[2:3] +; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_max_f64 v[0:1], v[7:8], v[0:1] +; GFX1132-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] ; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] @@ -6189,24 +6191,25 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] ; GFX7LESS-DPP-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-DPP-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] -; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX7LESS-DPP-NEXT: v_max_f64 v[6:7], v[4:5], v[4:5] -; GFX7LESS-DPP-NEXT: v_max_f64 v[2:3], v[6:7], v[2:3] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v5 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v4 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v3 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v0 ; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v6 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, v7 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v6 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v7 ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB9_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6560,16 +6563,17 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: global_load_b64 v[8:9], v10, s[0:1] ; GFX1164-DPP-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_max_f64 v[11:12], v[8:9], v[8:9] +; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[8:9], v[8:9] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[11:12], v[6:7] +; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[0:1] ; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[6:7], v10, v[6:9], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[8:9] @@ -6651,15 +6655,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: global_load_b64 v[8:9], v10, s[0:1] ; GFX1132-DPP-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_max_f64 v[11:12], v[8:9], v[8:9] +; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[8:9], v[8:9] ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[11:12], v[6:7] +; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[0:1] ; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[6:7], v10, v[6:9], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[8:9] @@ -7598,12 +7602,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec -; GFX7LESS-NEXT: v_mov_b32_e32 v41, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v42, 0x7ff80000 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0x7ff80000 ; GFX7LESS-NEXT: .LBB11_1: ; %ComputeLoop ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1] -; GFX7LESS-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42] +; GFX7LESS-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4 ; GFX7LESS-NEXT: v_readlane_b32 s2, v0, s4 ; GFX7LESS-NEXT: s_lshl_b64 s[4:5], 1, s4 @@ -7611,7 +7615,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] ; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 ; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3] -; GFX7LESS-NEXT: v_max_f64 v[41:42], v[2:3], v[4:5] +; GFX7LESS-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX7LESS-NEXT: s_cbranch_vccnz .LBB11_1 ; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 @@ -7627,16 +7631,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 ; GFX7LESS-NEXT: s_mov_b64 s[46:47], 0 +; GFX7LESS-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX7LESS-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42] ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX7LESS-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] ; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 ; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_max_f64 v[0:1], v[4:5], v[2:3] +; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[41:42] ; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 @@ -7710,20 +7714,20 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX9-NEXT: s_movk_i32 s32, 0x800 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: v_mov_b32_e32 v41, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_mov_b64 s[0:1], exec -; GFX9-NEXT: v_mov_b32_e32 v42, 0x7ff80000 +; GFX9-NEXT: v_mov_b32_e32 v3, 0x7ff80000 ; GFX9-NEXT: .LBB11_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1] ; GFX9-NEXT: v_readlane_b32 s3, v1, s4 ; GFX9-NEXT: v_readlane_b32 s2, v0, s4 -; GFX9-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42] +; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: v_max_f64 v[41:42], v[2:3], v[4:5] +; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX9-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -7735,14 +7739,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX9-NEXT: s_mov_b64 s[46:47], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX9-NEXT: global_load_dwordx2 v[4:5], v0, s[44:45] ; GFX9-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] +; GFX9-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX9-NEXT: s_add_u32 s8, s36, 44 ; GFX9-NEXT: s_addc_u32 s9, s37, 0 ; GFX9-NEXT: s_getpc_b64 s[0:1] @@ -7750,19 +7754,19 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX9-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 ; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] -; GFX9-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] +; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] ; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] ; GFX9-NEXT: s_mov_b32 s12, s43 ; GFX9-NEXT: s_mov_b32 s13, s42 ; GFX9-NEXT: s_mov_b32 s14, s33 ; GFX9-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 ; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 ; GFX9-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, s44 @@ -7773,8 +7777,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_load_dword v4, off, s[48:51], 0 +; GFX9-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: s_or_b64 s[46:47], vcc, s[46:47] @@ -7819,20 +7823,20 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1064-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-NEXT: v_mov_b32_e32 v41, 0 -; GFX1064-NEXT: v_mov_b32_e32 v42, 0x7ff80000 +; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: v_mov_b32_e32 v3, 0x7ff80000 ; GFX1064-NEXT: s_mov_b64 s[0:1], exec ; GFX1064-NEXT: .LBB11_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1] -; GFX1064-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42] +; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX1064-NEXT: v_readlane_b32 s3, v1, s4 ; GFX1064-NEXT: v_readlane_b32 s2, v0, s4 ; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] ; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1064-NEXT: v_max_f64 v[41:42], v[2:3], v[4:5] +; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX1064-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -7844,26 +7848,28 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX1064-NEXT: s_mov_b64 s[46:47], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX1064-NEXT: global_load_dwordx2 v[4:5], v0, s[44:45] ; GFX1064-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] +; GFX1064-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX1064-NEXT: s_add_u32 s8, s34, 44 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0 ; GFX1064-NEXT: s_getpc_b64 s[0:1] ; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 ; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 -; GFX1064-NEXT: v_mov_b32_e32 v0, 8 -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: v_mov_b32_e32 v2, s44 +; GFX1064-NEXT: v_mov_b32_e32 v3, s45 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: v_mov_b32_e32 v7, 0 ; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41] @@ -7873,18 +7879,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1064-NEXT: s_mov_b32 s13, s42 ; GFX1064-NEXT: s_mov_b32 s14, s33 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] -; GFX1064-NEXT: v_mov_b32_e32 v5, 8 -; GFX1064-NEXT: v_mov_b32_e32 v6, 0 -; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-NEXT: v_mov_b32_e32 v3, s45 -; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_load_dword v4, off, s[48:51], 0 +; GFX1064-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4 ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-NEXT: s_or_b64 s[46:47], vcc, s[46:47] @@ -7929,20 +7933,20 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1032-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-NEXT: v_mov_b32_e32 v41, 0 -; GFX1032-NEXT: v_mov_b32_e32 v42, 0x7ff80000 +; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: v_mov_b32_e32 v3, 0x7ff80000 ; GFX1032-NEXT: s_mov_b32 s0, exec_lo ; GFX1032-NEXT: .LBB11_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 -; GFX1032-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42] +; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX1032-NEXT: v_readlane_b32 s3, v1, s1 ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 -; GFX1032-NEXT: v_max_f64 v[41:42], v[2:3], v[4:5] +; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX1032-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -7954,25 +7958,27 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX1032-NEXT: global_load_dwordx2 v[4:5], v0, s[44:45] ; GFX1032-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] +; GFX1032-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX1032-NEXT: s_add_u32 s8, s34, 44 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0 ; GFX1032-NEXT: s_getpc_b64 s[0:1] ; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 ; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 -; GFX1032-NEXT: v_mov_b32_e32 v0, 8 -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: v_mov_b32_e32 v2, s44 +; GFX1032-NEXT: v_mov_b32_e32 v3, s45 +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: v_mov_b32_e32 v7, 0 ; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41] @@ -7982,18 +7988,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1032-NEXT: s_mov_b32 s13, s42 ; GFX1032-NEXT: s_mov_b32 s14, s33 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] -; GFX1032-NEXT: v_mov_b32_e32 v5, 8 -; GFX1032-NEXT: v_mov_b32_e32 v6, 0 -; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-NEXT: v_mov_b32_e32 v3, s45 -; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX1032-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_load_dword v4, off, s[48:51], 0 +; GFX1032-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4 ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-NEXT: s_or_b32 s46, vcc_lo, s46 @@ -8028,14 +8032,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-NEXT: v_mov_b32_e32 v41, 0 -; GFX1164-NEXT: v_mov_b32_e32 v42, 0x7ff80000 +; GFX1164-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-NEXT: v_mov_b32_e32 v3, 0x7ff80000 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: .LBB11_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1] -; GFX1164-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42] +; GFX1164-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX1164-NEXT: v_readlane_b32 s3, v1, s4 ; GFX1164-NEXT: v_readlane_b32 s2, v0, s4 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) @@ -8044,7 +8048,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164-NEXT: v_max_f64 v[41:42], v[2:3], v[4:5] +; GFX1164-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX1164-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -8057,16 +8061,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX1164-NEXT: s_mov_b64 s[46:47], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: global_load_b64 v[1:2], v0, s[44:45] +; GFX1164-NEXT: global_load_b64 v[4:5], v0, s[44:45] ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-NEXT: .p2align 6 ; GFX1164-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] +; GFX1164-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX1164-NEXT: s_add_u32 s8, s34, 44 ; GFX1164-NEXT: s_addc_u32 s9, s35, 0 ; GFX1164-NEXT: s_getpc_b64 s[0:1] @@ -8074,7 +8078,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-NEXT: v_mov_b32_e32 v2, s44 +; GFX1164-NEXT: v_mov_b32_e32 v3, s45 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: v_mov_b32_e32 v7, 0 ; GFX1164-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1164-NEXT: s_mov_b64 s[6:7], s[38:39] @@ -8082,19 +8088,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-NEXT: s_mov_b32 s12, s43 ; GFX1164-NEXT: s_mov_b32 s13, s42 ; GFX1164-NEXT: s_mov_b32 s14, s33 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX1164-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] -; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s44 -; GFX1164-NEXT: v_mov_b32_e32 v5, 8 -; GFX1164-NEXT: v_mov_b32_e32 v6, 0 -; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-NEXT: v_mov_b32_e32 v3, s45 +; GFX1164-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX1164-NEXT: scratch_store_b64 off, v[4:5], off ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-NEXT: scratch_store_b64 off, v[0:1], off offset:8 +; GFX1164-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1164-NEXT: scratch_load_b64 v[4:5], off, off ; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 @@ -8131,14 +8134,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1132-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-NEXT: v_mov_b32_e32 v41, 0 -; GFX1132-NEXT: v_mov_b32_e32 v42, 0x7ff80000 +; GFX1132-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-NEXT: v_mov_b32_e32 v3, 0x7ff80000 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: .LBB11_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 -; GFX1132-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42] +; GFX1132-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 @@ -8147,7 +8150,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_max_f64 v[41:42], v[2:3], v[4:5] +; GFX1132-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX1132-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -8160,22 +8163,24 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1132-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: global_load_b64 v[1:2], v0, s[44:45] +; GFX1132-NEXT: global_load_b64 v[4:5], v0, s[44:45] ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-NEXT: .p2align 6 ; GFX1132-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] +; GFX1132-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX1132-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-NEXT: s_getpc_b64 s[0:1] ; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 +; GFX1132-NEXT: v_mov_b32_e32 v31, v40 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-NEXT: v_mov_b32_e32 v3, s45 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0 ; GFX1132-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1132-NEXT: s_mov_b64 s[6:7], s[38:39] @@ -8183,16 +8188,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1132-NEXT: s_mov_b32 s12, s43 ; GFX1132-NEXT: s_mov_b32 s13, s42 ; GFX1132-NEXT: s_mov_b32 s14, s33 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX1132-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] -; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 -; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 -; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 +; GFX1132-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132-NEXT: v_mov_b32_e32 v2, s44 +; GFX1132-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX1132-NEXT: scratch_store_b64 off, v[4:5], off +; GFX1132-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 8 +; GFX1132-NEXT: scratch_store_b64 off, v[0:1], off offset:8 +; GFX1132-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1132-NEXT: scratch_load_b64 v[4:5], off, off ; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 @@ -8231,32 +8236,29 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX7LESS-DPP-NEXT: v_or_b32_e32 v42, v0, v2 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v40, v0, v2 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v40, v0 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v41, v1 -; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[44:47], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX7LESS-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] ; GFX7LESS-DPP-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-DPP-NEXT: v_max_f64 v[2:3], v[40:41], v[40:41] ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[4:5], v[2:3] +; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] ; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 @@ -8277,17 +8279,17 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s44 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7LESS-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7LESS-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB11_1 @@ -8640,23 +8642,23 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] ; GFX1032-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v8 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v42, v9 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v9 ; GFX1032-DPP-NEXT: s_mov_b32 s46, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 +; GFX1032-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] ; GFX1032-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] +; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] @@ -8669,6 +8671,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] @@ -8678,9 +8682,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[41:42] ; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s45 @@ -8775,9 +8777,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, v8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v8 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v42, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v9 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -8786,6 +8788,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 +; GFX1164-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] ; GFX1164-DPP-NEXT: s_mov_b64 s[46:47], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45] @@ -8793,9 +8796,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: .p2align 6 ; GFX1164-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] +; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] @@ -8804,6 +8806,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] @@ -8811,13 +8815,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_mov_b32 s12, s43 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] +; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[41:42] ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s44 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 @@ -8900,9 +8901,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v41, v8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v8 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v42, v9 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v9 ; GFX1132-DPP-NEXT: s_mov_b32 s46, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -8910,15 +8911,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 +; GFX1132-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45] ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-DPP-NEXT: .p2align 6 ; GFX1132-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] +; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] @@ -8926,6 +8927,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] @@ -8933,11 +8935,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_mov_b32 s12, s43 ; GFX1132-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[41:42] ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll index 1f521f2444984..cb3291df891af 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll @@ -328,12 +328,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7LESS-NEXT: .LBB1_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v2 -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX7LESS-NEXT: v_mul_f32_e32 v3, 1.0, v1 -; GFX7LESS-NEXT: v_min_f32_e32 v0, v3, v0 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GFX7LESS-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 ; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc @@ -399,14 +400,14 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v3, s[0:1] ; GFX9-NEXT: .LBB1_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_max_f32_e32 v0, v2, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f32_e32 v4, v1, v1 -; GFX9-NEXT: v_min_f32_e32 v0, v4, v0 +; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX9-NEXT: v_min_f32_e32 v0, v0, v2 ; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -757,14 +758,14 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: v_max_f32_e64 v6, s4, s4 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1] ; GFX9-DPP-NEXT: .LBB1_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_max_f32_e64 v0, s4, s4 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_max_f32_e32 v6, v1, v1 -; GFX9-DPP-NEXT: v_min_f32_e32 v0, v6, v0 +; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX9-DPP-NEXT: v_min_f32_e32 v0, v0, v6 ; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -1366,12 +1367,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7LESS-NEXT: .LBB3_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v2 -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX7LESS-NEXT: v_mul_f32_e32 v3, 1.0, v1 -; GFX7LESS-NEXT: v_min_f32_e32 v0, v3, v0 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GFX7LESS-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 ; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc @@ -1437,14 +1439,14 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v3, s[0:1] ; GFX9-NEXT: .LBB3_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_max_f32_e32 v0, v2, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f32_e32 v4, v1, v1 -; GFX9-NEXT: v_min_f32_e32 v0, v4, v0 +; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX9-NEXT: v_min_f32_e32 v0, v0, v2 ; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -1795,14 +1797,14 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: v_max_f32_e64 v6, s4, s4 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1] ; GFX9-DPP-NEXT: .LBB3_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_max_f32_e64 v0, s4, s4 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_max_f32_e32 v6, v1, v1 -; GFX9-DPP-NEXT: v_min_f32_e32 v0, v6, v0 +; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX9-DPP-NEXT: v_min_f32_e32 v0, v0, v6 ; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -2404,12 +2406,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7LESS-NEXT: .LBB5_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v2 -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX7LESS-NEXT: v_mul_f32_e32 v3, 1.0, v1 -; GFX7LESS-NEXT: v_min_f32_e32 v0, v3, v0 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GFX7LESS-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 ; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc @@ -2475,14 +2478,14 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v3, s[0:1] ; GFX9-NEXT: .LBB5_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_max_f32_e32 v0, v2, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f32_e32 v4, v1, v1 -; GFX9-NEXT: v_min_f32_e32 v0, v4, v0 +; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX9-NEXT: v_min_f32_e32 v0, v0, v2 ; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -2833,14 +2836,14 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: v_max_f32_e64 v6, s4, s4 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1] ; GFX9-DPP-NEXT: .LBB5_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_max_f32_e64 v0, s4, s4 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_max_f32_e32 v6, v1, v1 -; GFX9-DPP-NEXT: v_min_f32_e32 v0, v6, v0 +; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX9-DPP-NEXT: v_min_f32_e32 v0, v0, v6 ; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -4051,12 +4054,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec -; GFX7LESS-NEXT: v_mov_b32_e32 v41, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v42, 0x7ff80000 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0x7ff80000 ; GFX7LESS-NEXT: .LBB7_1: ; %ComputeLoop ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1] -; GFX7LESS-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42] +; GFX7LESS-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4 ; GFX7LESS-NEXT: v_readlane_b32 s2, v0, s4 ; GFX7LESS-NEXT: s_lshl_b64 s[4:5], 1, s4 @@ -4064,7 +4067,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] ; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 ; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3] -; GFX7LESS-NEXT: v_min_f64 v[41:42], v[2:3], v[4:5] +; GFX7LESS-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX7LESS-NEXT: s_cbranch_vccnz .LBB7_1 ; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 @@ -4080,16 +4083,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 ; GFX7LESS-NEXT: s_mov_b64 s[46:47], 0 +; GFX7LESS-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX7LESS-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42] ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX7LESS-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] ; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 ; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_min_f64 v[0:1], v[4:5], v[2:3] +; GFX7LESS-NEXT: v_min_f64 v[0:1], v[2:3], v[41:42] ; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 @@ -4163,20 +4166,20 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX9-NEXT: s_movk_i32 s32, 0x800 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: v_mov_b32_e32 v41, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_mov_b64 s[0:1], exec -; GFX9-NEXT: v_mov_b32_e32 v42, 0x7ff80000 +; GFX9-NEXT: v_mov_b32_e32 v3, 0x7ff80000 ; GFX9-NEXT: .LBB7_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1] ; GFX9-NEXT: v_readlane_b32 s3, v1, s4 ; GFX9-NEXT: v_readlane_b32 s2, v0, s4 -; GFX9-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42] +; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: v_min_f64 v[41:42], v[2:3], v[4:5] +; GFX9-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX9-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -4188,14 +4191,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX9-NEXT: s_mov_b64 s[46:47], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX9-NEXT: global_load_dwordx2 v[4:5], v0, s[44:45] ; GFX9-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] +; GFX9-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX9-NEXT: s_add_u32 s8, s36, 44 ; GFX9-NEXT: s_addc_u32 s9, s37, 0 ; GFX9-NEXT: s_getpc_b64 s[0:1] @@ -4203,19 +4206,19 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX9-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 ; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] -; GFX9-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] +; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] ; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] ; GFX9-NEXT: s_mov_b32 s12, s43 ; GFX9-NEXT: s_mov_b32 s13, s42 ; GFX9-NEXT: s_mov_b32 s14, s33 ; GFX9-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 ; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 ; GFX9-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, s44 @@ -4226,8 +4229,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_load_dword v4, off, s[48:51], 0 +; GFX9-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: s_or_b64 s[46:47], vcc, s[46:47] @@ -4272,20 +4275,20 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1064-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-NEXT: v_mov_b32_e32 v41, 0 -; GFX1064-NEXT: v_mov_b32_e32 v42, 0x7ff80000 +; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: v_mov_b32_e32 v3, 0x7ff80000 ; GFX1064-NEXT: s_mov_b64 s[0:1], exec ; GFX1064-NEXT: .LBB7_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1] -; GFX1064-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42] +; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX1064-NEXT: v_readlane_b32 s3, v1, s4 ; GFX1064-NEXT: v_readlane_b32 s2, v0, s4 ; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] ; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1064-NEXT: v_min_f64 v[41:42], v[2:3], v[4:5] +; GFX1064-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX1064-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -4297,26 +4300,28 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX1064-NEXT: s_mov_b64 s[46:47], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX1064-NEXT: global_load_dwordx2 v[4:5], v0, s[44:45] ; GFX1064-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] +; GFX1064-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX1064-NEXT: s_add_u32 s8, s34, 44 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0 ; GFX1064-NEXT: s_getpc_b64 s[0:1] ; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 ; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 -; GFX1064-NEXT: v_mov_b32_e32 v0, 8 -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: v_mov_b32_e32 v2, s44 +; GFX1064-NEXT: v_mov_b32_e32 v3, s45 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: v_mov_b32_e32 v7, 0 ; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41] @@ -4326,18 +4331,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1064-NEXT: s_mov_b32 s13, s42 ; GFX1064-NEXT: s_mov_b32 s14, s33 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] -; GFX1064-NEXT: v_mov_b32_e32 v5, 8 -; GFX1064-NEXT: v_mov_b32_e32 v6, 0 -; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-NEXT: v_mov_b32_e32 v3, s45 -; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_load_dword v4, off, s[48:51], 0 +; GFX1064-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4 ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-NEXT: s_or_b64 s[46:47], vcc, s[46:47] @@ -4382,20 +4385,20 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1032-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-NEXT: v_mov_b32_e32 v41, 0 -; GFX1032-NEXT: v_mov_b32_e32 v42, 0x7ff80000 +; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: v_mov_b32_e32 v3, 0x7ff80000 ; GFX1032-NEXT: s_mov_b32 s0, exec_lo ; GFX1032-NEXT: .LBB7_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 -; GFX1032-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42] +; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX1032-NEXT: v_readlane_b32 s3, v1, s1 ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 -; GFX1032-NEXT: v_min_f64 v[41:42], v[2:3], v[4:5] +; GFX1032-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX1032-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -4407,25 +4410,27 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX1032-NEXT: global_load_dwordx2 v[4:5], v0, s[44:45] ; GFX1032-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] +; GFX1032-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX1032-NEXT: s_add_u32 s8, s34, 44 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0 ; GFX1032-NEXT: s_getpc_b64 s[0:1] ; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 ; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 -; GFX1032-NEXT: v_mov_b32_e32 v0, 8 -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: v_mov_b32_e32 v2, s44 +; GFX1032-NEXT: v_mov_b32_e32 v3, s45 +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: v_mov_b32_e32 v7, 0 ; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41] @@ -4435,18 +4440,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1032-NEXT: s_mov_b32 s13, s42 ; GFX1032-NEXT: s_mov_b32 s14, s33 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] -; GFX1032-NEXT: v_mov_b32_e32 v5, 8 -; GFX1032-NEXT: v_mov_b32_e32 v6, 0 -; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-NEXT: v_mov_b32_e32 v3, s45 -; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX1032-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_load_dword v4, off, s[48:51], 0 +; GFX1032-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4 ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-NEXT: s_or_b32 s46, vcc_lo, s46 @@ -4481,14 +4484,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-NEXT: v_mov_b32_e32 v41, 0 -; GFX1164-NEXT: v_mov_b32_e32 v42, 0x7ff80000 +; GFX1164-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-NEXT: v_mov_b32_e32 v3, 0x7ff80000 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: .LBB7_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1] -; GFX1164-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42] +; GFX1164-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX1164-NEXT: v_readlane_b32 s3, v1, s4 ; GFX1164-NEXT: v_readlane_b32 s2, v0, s4 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) @@ -4497,7 +4500,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164-NEXT: v_min_f64 v[41:42], v[2:3], v[4:5] +; GFX1164-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX1164-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -4510,16 +4513,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX1164-NEXT: s_mov_b64 s[46:47], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: global_load_b64 v[1:2], v0, s[44:45] +; GFX1164-NEXT: global_load_b64 v[4:5], v0, s[44:45] ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-NEXT: .p2align 6 ; GFX1164-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] +; GFX1164-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX1164-NEXT: s_add_u32 s8, s34, 44 ; GFX1164-NEXT: s_addc_u32 s9, s35, 0 ; GFX1164-NEXT: s_getpc_b64 s[0:1] @@ -4527,7 +4530,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-NEXT: v_mov_b32_e32 v2, s44 +; GFX1164-NEXT: v_mov_b32_e32 v3, s45 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: v_mov_b32_e32 v7, 0 ; GFX1164-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1164-NEXT: s_mov_b64 s[6:7], s[38:39] @@ -4535,19 +4540,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-NEXT: s_mov_b32 s12, s43 ; GFX1164-NEXT: s_mov_b32 s13, s42 ; GFX1164-NEXT: s_mov_b32 s14, s33 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX1164-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] -; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s44 -; GFX1164-NEXT: v_mov_b32_e32 v5, 8 -; GFX1164-NEXT: v_mov_b32_e32 v6, 0 -; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-NEXT: v_mov_b32_e32 v3, s45 +; GFX1164-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX1164-NEXT: scratch_store_b64 off, v[4:5], off ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-NEXT: scratch_store_b64 off, v[0:1], off offset:8 +; GFX1164-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1164-NEXT: scratch_load_b64 v[4:5], off, off ; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 @@ -4584,14 +4586,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1132-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-NEXT: v_mov_b32_e32 v41, 0 -; GFX1132-NEXT: v_mov_b32_e32 v42, 0x7ff80000 +; GFX1132-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-NEXT: v_mov_b32_e32 v3, 0x7ff80000 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: .LBB7_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 -; GFX1132-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42] +; GFX1132-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 @@ -4600,7 +4602,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_min_f64 v[41:42], v[2:3], v[4:5] +; GFX1132-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX1132-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -4613,22 +4615,24 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1132-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: global_load_b64 v[1:2], v0, s[44:45] +; GFX1132-NEXT: global_load_b64 v[4:5], v0, s[44:45] ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-NEXT: .p2align 6 ; GFX1132-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] +; GFX1132-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX1132-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-NEXT: s_getpc_b64 s[0:1] ; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 +; GFX1132-NEXT: v_mov_b32_e32 v31, v40 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-NEXT: v_mov_b32_e32 v3, s45 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0 ; GFX1132-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1132-NEXT: s_mov_b64 s[6:7], s[38:39] @@ -4636,16 +4640,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1132-NEXT: s_mov_b32 s12, s43 ; GFX1132-NEXT: s_mov_b32 s13, s42 ; GFX1132-NEXT: s_mov_b32 s14, s33 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX1132-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] -; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 -; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 -; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 +; GFX1132-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132-NEXT: v_mov_b32_e32 v2, s44 +; GFX1132-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX1132-NEXT: scratch_store_b64 off, v[4:5], off +; GFX1132-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 8 +; GFX1132-NEXT: scratch_store_b64 off, v[0:1], off offset:8 +; GFX1132-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1132-NEXT: scratch_load_b64 v[4:5], off, off ; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 @@ -4684,32 +4688,29 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX7LESS-DPP-NEXT: v_or_b32_e32 v42, v0, v2 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v40, v0, v2 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v40, v0 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v41, v1 -; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[44:47], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX7LESS-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] ; GFX7LESS-DPP-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-DPP-NEXT: v_max_f64 v[2:3], v[40:41], v[40:41] ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_min_f64 v[0:1], v[4:5], v[2:3] +; GFX7LESS-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] ; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 @@ -4730,17 +4731,17 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s44 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7LESS-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7LESS-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB7_1 @@ -5093,23 +5094,23 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] ; GFX1032-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v8 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v42, v9 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v9 ; GFX1032-DPP-NEXT: s_mov_b32 s46, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 +; GFX1032-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] ; GFX1032-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] +; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] @@ -5122,6 +5123,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] @@ -5131,9 +5134,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[41:42] ; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s45 @@ -5228,9 +5229,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, v8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v8 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v42, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v9 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -5239,6 +5240,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 +; GFX1164-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] ; GFX1164-DPP-NEXT: s_mov_b64 s[46:47], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45] @@ -5246,9 +5248,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: .p2align 6 ; GFX1164-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] +; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] @@ -5257,6 +5258,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] @@ -5264,13 +5267,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_mov_b32 s12, s43 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX1164-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] +; GFX1164-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[41:42] ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s44 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 @@ -5353,9 +5353,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v41, v8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v8 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v42, v9 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v9 ; GFX1132-DPP-NEXT: s_mov_b32 s46, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -5363,15 +5363,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 +; GFX1132-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45] ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-DPP-NEXT: .p2align 6 ; GFX1132-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] +; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] @@ -5379,6 +5379,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] @@ -5386,11 +5387,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_mov_b32 s12, s43 ; GFX1132-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX1132-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1132-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[41:42] ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -5807,12 +5807,13 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX7LESS-NEXT: .LBB9_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX7LESS-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] -; GFX7LESS-NEXT: v_min_f64 v[0:1], v[6:7], v[0:1] +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX7LESS-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3 ; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2 ; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1 @@ -5882,15 +5883,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] ; GFX9-NEXT: .LBB9_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f64 v[7:8], v[2:3], v[2:3] -; GFX9-NEXT: v_min_f64 v[0:1], v[7:8], v[0:1] +; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -6067,16 +6068,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1164-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1] ; GFX1164-NEXT: .LBB9_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_max_f64 v[7:8], v[2:3], v[2:3] +; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_min_f64 v[0:1], v[7:8], v[0:1] +; GFX1164-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] ; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -6137,15 +6138,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1132-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1] ; GFX1132-NEXT: .LBB9_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_max_f64 v[7:8], v[2:3], v[2:3] +; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_min_f64 v[0:1], v[7:8], v[0:1] +; GFX1132-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] ; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] @@ -6189,24 +6191,25 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] ; GFX7LESS-DPP-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-DPP-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] -; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX7LESS-DPP-NEXT: v_max_f64 v[6:7], v[4:5], v[4:5] -; GFX7LESS-DPP-NEXT: v_min_f64 v[2:3], v[6:7], v[2:3] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v5 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v4 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX7LESS-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v3 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v0 ; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v6 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, v7 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v6 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v7 ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB9_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6560,16 +6563,17 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: global_load_b64 v[8:9], v10, s[0:1] ; GFX1164-DPP-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_max_f64 v[11:12], v[8:9], v[8:9] +; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[8:9], v[8:9] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_min_f64 v[6:7], v[11:12], v[6:7] +; GFX1164-DPP-NEXT: v_min_f64 v[6:7], v[6:7], v[0:1] ; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[6:7], v10, v[6:9], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[8:9] @@ -6651,15 +6655,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: global_load_b64 v[8:9], v10, s[0:1] ; GFX1132-DPP-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_max_f64 v[11:12], v[8:9], v[8:9] +; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[8:9], v[8:9] ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_min_f64 v[6:7], v[11:12], v[6:7] +; GFX1132-DPP-NEXT: v_min_f64 v[6:7], v[6:7], v[0:1] ; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[6:7], v10, v[6:9], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[8:9] @@ -7598,12 +7602,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec -; GFX7LESS-NEXT: v_mov_b32_e32 v41, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v42, 0x7ff80000 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0x7ff80000 ; GFX7LESS-NEXT: .LBB11_1: ; %ComputeLoop ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1] -; GFX7LESS-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42] +; GFX7LESS-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4 ; GFX7LESS-NEXT: v_readlane_b32 s2, v0, s4 ; GFX7LESS-NEXT: s_lshl_b64 s[4:5], 1, s4 @@ -7611,7 +7615,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] ; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 ; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3] -; GFX7LESS-NEXT: v_min_f64 v[41:42], v[2:3], v[4:5] +; GFX7LESS-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX7LESS-NEXT: s_cbranch_vccnz .LBB11_1 ; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 @@ -7627,16 +7631,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 ; GFX7LESS-NEXT: s_mov_b64 s[46:47], 0 +; GFX7LESS-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX7LESS-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42] ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX7LESS-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] ; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 ; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_min_f64 v[0:1], v[4:5], v[2:3] +; GFX7LESS-NEXT: v_min_f64 v[0:1], v[2:3], v[41:42] ; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 @@ -7710,20 +7714,20 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX9-NEXT: s_movk_i32 s32, 0x800 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: v_mov_b32_e32 v41, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_mov_b64 s[0:1], exec -; GFX9-NEXT: v_mov_b32_e32 v42, 0x7ff80000 +; GFX9-NEXT: v_mov_b32_e32 v3, 0x7ff80000 ; GFX9-NEXT: .LBB11_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1] ; GFX9-NEXT: v_readlane_b32 s3, v1, s4 ; GFX9-NEXT: v_readlane_b32 s2, v0, s4 -; GFX9-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42] +; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: v_min_f64 v[41:42], v[2:3], v[4:5] +; GFX9-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX9-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -7735,14 +7739,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX9-NEXT: s_mov_b64 s[46:47], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX9-NEXT: global_load_dwordx2 v[4:5], v0, s[44:45] ; GFX9-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] +; GFX9-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX9-NEXT: s_add_u32 s8, s36, 44 ; GFX9-NEXT: s_addc_u32 s9, s37, 0 ; GFX9-NEXT: s_getpc_b64 s[0:1] @@ -7750,19 +7754,19 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX9-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 ; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] -; GFX9-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] +; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] ; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] ; GFX9-NEXT: s_mov_b32 s12, s43 ; GFX9-NEXT: s_mov_b32 s13, s42 ; GFX9-NEXT: s_mov_b32 s14, s33 ; GFX9-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 ; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 ; GFX9-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, s44 @@ -7773,8 +7777,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_load_dword v4, off, s[48:51], 0 +; GFX9-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: s_or_b64 s[46:47], vcc, s[46:47] @@ -7819,20 +7823,20 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1064-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-NEXT: v_mov_b32_e32 v41, 0 -; GFX1064-NEXT: v_mov_b32_e32 v42, 0x7ff80000 +; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: v_mov_b32_e32 v3, 0x7ff80000 ; GFX1064-NEXT: s_mov_b64 s[0:1], exec ; GFX1064-NEXT: .LBB11_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1] -; GFX1064-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42] +; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX1064-NEXT: v_readlane_b32 s3, v1, s4 ; GFX1064-NEXT: v_readlane_b32 s2, v0, s4 ; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] ; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1064-NEXT: v_min_f64 v[41:42], v[2:3], v[4:5] +; GFX1064-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX1064-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -7844,26 +7848,28 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX1064-NEXT: s_mov_b64 s[46:47], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX1064-NEXT: global_load_dwordx2 v[4:5], v0, s[44:45] ; GFX1064-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] +; GFX1064-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX1064-NEXT: s_add_u32 s8, s34, 44 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0 ; GFX1064-NEXT: s_getpc_b64 s[0:1] ; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 ; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 -; GFX1064-NEXT: v_mov_b32_e32 v0, 8 -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: v_mov_b32_e32 v2, s44 +; GFX1064-NEXT: v_mov_b32_e32 v3, s45 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: v_mov_b32_e32 v7, 0 ; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41] @@ -7873,18 +7879,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1064-NEXT: s_mov_b32 s13, s42 ; GFX1064-NEXT: s_mov_b32 s14, s33 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] -; GFX1064-NEXT: v_mov_b32_e32 v5, 8 -; GFX1064-NEXT: v_mov_b32_e32 v6, 0 -; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-NEXT: v_mov_b32_e32 v3, s45 -; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_load_dword v4, off, s[48:51], 0 +; GFX1064-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4 ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-NEXT: s_or_b64 s[46:47], vcc, s[46:47] @@ -7929,20 +7933,20 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1032-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-NEXT: v_mov_b32_e32 v41, 0 -; GFX1032-NEXT: v_mov_b32_e32 v42, 0x7ff80000 +; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: v_mov_b32_e32 v3, 0x7ff80000 ; GFX1032-NEXT: s_mov_b32 s0, exec_lo ; GFX1032-NEXT: .LBB11_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 -; GFX1032-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42] +; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX1032-NEXT: v_readlane_b32 s3, v1, s1 ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 -; GFX1032-NEXT: v_min_f64 v[41:42], v[2:3], v[4:5] +; GFX1032-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX1032-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -7954,25 +7958,27 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX1032-NEXT: global_load_dwordx2 v[4:5], v0, s[44:45] ; GFX1032-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] +; GFX1032-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX1032-NEXT: s_add_u32 s8, s34, 44 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0 ; GFX1032-NEXT: s_getpc_b64 s[0:1] ; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 ; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 -; GFX1032-NEXT: v_mov_b32_e32 v0, 8 -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: v_mov_b32_e32 v2, s44 +; GFX1032-NEXT: v_mov_b32_e32 v3, s45 +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: v_mov_b32_e32 v7, 0 ; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41] @@ -7982,18 +7988,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1032-NEXT: s_mov_b32 s13, s42 ; GFX1032-NEXT: s_mov_b32 s14, s33 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] -; GFX1032-NEXT: v_mov_b32_e32 v5, 8 -; GFX1032-NEXT: v_mov_b32_e32 v6, 0 -; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-NEXT: v_mov_b32_e32 v3, s45 -; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX1032-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_load_dword v4, off, s[48:51], 0 +; GFX1032-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4 ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-NEXT: s_or_b32 s46, vcc_lo, s46 @@ -8028,14 +8032,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-NEXT: v_mov_b32_e32 v41, 0 -; GFX1164-NEXT: v_mov_b32_e32 v42, 0x7ff80000 +; GFX1164-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-NEXT: v_mov_b32_e32 v3, 0x7ff80000 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: .LBB11_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1] -; GFX1164-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42] +; GFX1164-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX1164-NEXT: v_readlane_b32 s3, v1, s4 ; GFX1164-NEXT: v_readlane_b32 s2, v0, s4 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) @@ -8044,7 +8048,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164-NEXT: v_min_f64 v[41:42], v[2:3], v[4:5] +; GFX1164-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX1164-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -8057,16 +8061,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX1164-NEXT: s_mov_b64 s[46:47], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: global_load_b64 v[1:2], v0, s[44:45] +; GFX1164-NEXT: global_load_b64 v[4:5], v0, s[44:45] ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-NEXT: .p2align 6 ; GFX1164-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] +; GFX1164-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX1164-NEXT: s_add_u32 s8, s34, 44 ; GFX1164-NEXT: s_addc_u32 s9, s35, 0 ; GFX1164-NEXT: s_getpc_b64 s[0:1] @@ -8074,7 +8078,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-NEXT: v_mov_b32_e32 v2, s44 +; GFX1164-NEXT: v_mov_b32_e32 v3, s45 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: v_mov_b32_e32 v7, 0 ; GFX1164-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1164-NEXT: s_mov_b64 s[6:7], s[38:39] @@ -8082,19 +8088,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-NEXT: s_mov_b32 s12, s43 ; GFX1164-NEXT: s_mov_b32 s13, s42 ; GFX1164-NEXT: s_mov_b32 s14, s33 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX1164-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] -; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s44 -; GFX1164-NEXT: v_mov_b32_e32 v5, 8 -; GFX1164-NEXT: v_mov_b32_e32 v6, 0 -; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-NEXT: v_mov_b32_e32 v3, s45 +; GFX1164-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX1164-NEXT: scratch_store_b64 off, v[4:5], off ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-NEXT: scratch_store_b64 off, v[0:1], off offset:8 +; GFX1164-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1164-NEXT: scratch_load_b64 v[4:5], off, off ; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 @@ -8131,14 +8134,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1132-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-NEXT: v_mov_b32_e32 v41, 0 -; GFX1132-NEXT: v_mov_b32_e32 v42, 0x7ff80000 +; GFX1132-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-NEXT: v_mov_b32_e32 v3, 0x7ff80000 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: .LBB11_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 -; GFX1132-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42] +; GFX1132-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 @@ -8147,7 +8150,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_min_f64 v[41:42], v[2:3], v[4:5] +; GFX1132-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX1132-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -8160,22 +8163,24 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1132-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: global_load_b64 v[1:2], v0, s[44:45] +; GFX1132-NEXT: global_load_b64 v[4:5], v0, s[44:45] ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-NEXT: .p2align 6 ; GFX1132-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] +; GFX1132-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX1132-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-NEXT: s_getpc_b64 s[0:1] ; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 +; GFX1132-NEXT: v_mov_b32_e32 v31, v40 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-NEXT: v_mov_b32_e32 v3, s45 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0 ; GFX1132-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1132-NEXT: s_mov_b64 s[6:7], s[38:39] @@ -8183,16 +8188,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1132-NEXT: s_mov_b32 s12, s43 ; GFX1132-NEXT: s_mov_b32 s13, s42 ; GFX1132-NEXT: s_mov_b32 s14, s33 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX1132-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] -; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 -; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 -; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 +; GFX1132-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132-NEXT: v_mov_b32_e32 v2, s44 +; GFX1132-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX1132-NEXT: scratch_store_b64 off, v[4:5], off +; GFX1132-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 8 +; GFX1132-NEXT: scratch_store_b64 off, v[0:1], off offset:8 +; GFX1132-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1132-NEXT: scratch_load_b64 v[4:5], off, off ; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 @@ -8231,32 +8236,29 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX7LESS-DPP-NEXT: v_or_b32_e32 v42, v0, v2 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v40, v0, v2 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v40, v0 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v41, v1 -; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[44:47], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX7LESS-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] ; GFX7LESS-DPP-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-DPP-NEXT: v_max_f64 v[2:3], v[40:41], v[40:41] ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_min_f64 v[0:1], v[4:5], v[2:3] +; GFX7LESS-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] ; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 @@ -8277,17 +8279,17 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s44 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7LESS-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7LESS-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB11_1 @@ -8640,23 +8642,23 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] ; GFX1032-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v8 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v42, v9 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v9 ; GFX1032-DPP-NEXT: s_mov_b32 s46, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 +; GFX1032-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] ; GFX1032-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] +; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] @@ -8669,6 +8671,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] @@ -8678,9 +8682,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[41:42] ; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s45 @@ -8775,9 +8777,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, v8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v8 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v42, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v9 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -8786,6 +8788,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 +; GFX1164-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] ; GFX1164-DPP-NEXT: s_mov_b64 s[46:47], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45] @@ -8793,9 +8796,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: .p2align 6 ; GFX1164-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] +; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] @@ -8804,6 +8806,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] @@ -8811,13 +8815,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_mov_b32 s12, s43 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX1164-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] +; GFX1164-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[41:42] ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s44 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 @@ -8900,9 +8901,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v41, v8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v8 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v42, v9 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v9 ; GFX1132-DPP-NEXT: s_mov_b32 s46, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -8910,15 +8911,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 +; GFX1132-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45] ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-DPP-NEXT: .p2align 6 ; GFX1132-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] +; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] @@ -8926,6 +8927,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] @@ -8933,11 +8935,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_mov_b32 s12, s43 ; GFX1132-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX1132-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1132-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[41:42] ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll index f1d58814f9f04..5c5a769178dd9 100644 --- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll @@ -54,18 +54,18 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_mov_b64 s[16:17], s[4:5] ; GFX11-NEXT: v_mov_b32_e32 v31, v0 -; GFX11-NEXT: s_load_b32 s24, s[16:17], 0x24 +; GFX11-NEXT: s_load_b32 s19, s[16:17], 0x24 ; GFX11-NEXT: s_mov_b32 s12, s13 ; GFX11-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX11-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v31 ; GFX11-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX11-NEXT: s_mov_b32 s19, 0 +; GFX11-NEXT: s_mov_b32 s20, 0 ; GFX11-NEXT: s_mov_b32 s0, -1 ; GFX11-NEXT: s_mov_b32 s3, exec_lo ; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mul_lo_u32 v0, s24, v0 +; GFX11-NEXT: v_mul_lo_u32 v0, s19, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11-NEXT: s_cbranch_execz .LBB2_13 @@ -74,7 +74,7 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg ; GFX11-NEXT: s_mov_b32 s18, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_bitcmp1_b32 s21, 0 -; GFX11-NEXT: s_cselect_b32 s19, -1, 0 +; GFX11-NEXT: s_cselect_b32 s24, -1, 0 ; GFX11-NEXT: s_bitcmp0_b32 s21, 0 ; GFX11-NEXT: s_cbranch_scc0 .LBB2_3 ; GFX11-NEXT: ; %bb.2: ; %bb15 @@ -110,60 +110,58 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg ; GFX11-NEXT: s_cbranch_scc0 .LBB2_8 ; GFX11-NEXT: ; %bb.5: ; %bb18.preheader ; GFX11-NEXT: s_load_b128 s[28:31], s[16:17], 0x44 -; GFX11-NEXT: s_mov_b32 vcc_lo, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_mul_hi_u32 s0, s29, s28 ; GFX11-NEXT: s_mul_i32 s1, s29, s28 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_alignbit_b32 v0, s0, s1, 1 ; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_mov_b32 s13, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b32 s0, s0, 1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_lshr_b32 s0, s0, s30 -; GFX11-NEXT: s_mul_i32 s0, s0, s22 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_mul_i32 s0, s0, s22 ; GFX11-NEXT: s_mul_i32 s0, s0, s20 -; GFX11-NEXT: s_or_b32 s0, s24, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s0, s19, s0 ; GFX11-NEXT: s_lshl_b64 s[20:21], s[0:1], 1 -; GFX11-NEXT: global_load_u16 v0, v0, s[20:21] +; GFX11-NEXT: s_mov_b32 s0, s1 +; GFX11-NEXT: global_load_u16 v1, v0, s[20:21] +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s24 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11-NEXT: s_mov_b32 vcc_lo, 0 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB2_6: ; %bb18 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s19 +; GFX11-NEXT: v_readfirstlane_b32 s13, v0 ; GFX11-NEXT: s_cmp_lg_u32 s1, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 ; GFX11-NEXT: s_cselect_b32 s1, -1, 0 -; GFX11-NEXT: v_readfirstlane_b32 s20, v0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s1 ; GFX11-NEXT: s_and_b32 s1, s8, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: s_and_b32 s1, s1, exec_lo -; GFX11-NEXT: v_readfirstlane_b32 s21, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: s_cselect_b32 s1, s21, s20 -; GFX11-NEXT: s_and_b32 s20, 0xffff, s13 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_readfirstlane_b32 s19, v2 +; GFX11-NEXT: s_cselect_b32 s1, s19, s13 +; GFX11-NEXT: s_and_b32 s13, 0xffff, s0 ; GFX11-NEXT: s_and_b32 s1, s1, 1 -; GFX11-NEXT: s_cmp_lg_u32 s20, 0 -; GFX11-NEXT: s_cselect_b32 s20, -1, 0 -; GFX11-NEXT: s_and_b32 s22, s9, exec_lo -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s20 -; GFX11-NEXT: v_readfirstlane_b32 s20, v1 +; GFX11-NEXT: s_cmp_lg_u32 s13, 0 +; GFX11-NEXT: s_cselect_b32 s13, -1, 0 +; GFX11-NEXT: s_and_b32 s20, s9, exec_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s13 +; GFX11-NEXT: v_readfirstlane_b32 s13, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_readfirstlane_b32 s21, v0 -; GFX11-NEXT: s_cselect_b32 s20, s21, s20 +; GFX11-NEXT: v_readfirstlane_b32 s19, v2 +; GFX11-NEXT: s_cselect_b32 s13, s19, s13 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_bitcmp1_b32 s20, 0 -; GFX11-NEXT: s_cselect_b32 s20, 0x100, 0 -; GFX11-NEXT: s_or_b32 s13, s20, s13 +; GFX11-NEXT: s_bitcmp1_b32 s13, 0 +; GFX11-NEXT: s_cselect_b32 s13, 0x100, 0 +; GFX11-NEXT: s_or_b32 s0, s13, s0 ; GFX11-NEXT: s_cbranch_vccz .LBB2_6 ; GFX11-NEXT: ; %bb.7: ; %Flow ; GFX11-NEXT: s_mov_b32 s0, 0 @@ -181,7 +179,7 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg ; GFX11-NEXT: ; %bb.11: ; %Flow6 ; GFX11-NEXT: s_mov_b32 s18, -1 ; GFX11-NEXT: .LBB2_12: ; %Flow11 -; GFX11-NEXT: s_and_b32 s19, s2, exec_lo +; GFX11-NEXT: s_and_b32 s20, s2, exec_lo ; GFX11-NEXT: s_or_not1_b32 s0, s18, exec_lo ; GFX11-NEXT: .LBB2_13: ; %Flow9 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s3 @@ -198,10 +196,10 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg ; GFX11-NEXT: s_mov_b32 s14, s15 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_or_b32 s19, s19, exec_lo +; GFX11-NEXT: s_or_b32 s20, s20, exec_lo ; GFX11-NEXT: .LBB2_15: ; %Flow14 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX11-NEXT: s_and_saveexec_b32 s0, s19 +; GFX11-NEXT: s_and_saveexec_b32 s0, s20 ; GFX11-NEXT: ; %bb.16: ; %UnifiedUnreachableBlock ; GFX11-NEXT: ; divergent unreachable ; GFX11-NEXT: ; %bb.17: ; %UnifiedReturnBlock diff --git a/llvm/test/CodeGen/AMDGPU/licm-regpressure.mir b/llvm/test/CodeGen/AMDGPU/licm-regpressure.mir index 67cdf196a4693..dd478f94e1039 100644 --- a/llvm/test/CodeGen/AMDGPU/licm-regpressure.mir +++ b/llvm/test/CodeGen/AMDGPU/licm-regpressure.mir @@ -32,38 +32,38 @@ body: | ; GCN-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GCN-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GCN-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: bb.1: - ; GCN-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000) - ; GCN-NEXT: liveins: $vcc - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: $vcc = S_AND_B64 $exec, $vcc, implicit-def $scc ; GCN-NEXT: [[V_CVT_F64_I32_e32_:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY]], implicit $mode, implicit $exec - ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_]], implicit $exec ; GCN-NEXT: [[V_CVT_F64_I32_e32_1:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY1]], implicit $mode, implicit $exec - ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_1]], implicit $exec ; GCN-NEXT: [[V_CVT_F64_I32_e32_2:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY2]], implicit $mode, implicit $exec - ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_2]], implicit $exec ; GCN-NEXT: [[V_CVT_F64_I32_e32_3:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY3]], implicit $mode, implicit $exec - ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_3]], implicit $exec ; GCN-NEXT: [[V_CVT_F64_I32_e32_4:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY4]], implicit $mode, implicit $exec - ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_4]], implicit $exec ; GCN-NEXT: [[V_CVT_F64_I32_e32_5:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY5]], implicit $mode, implicit $exec - ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_5]], implicit $exec ; GCN-NEXT: [[V_CVT_F64_I32_e32_6:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY6]], implicit $mode, implicit $exec - ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_6]], implicit $exec ; GCN-NEXT: [[V_CVT_F64_I32_e32_7:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY7]], implicit $mode, implicit $exec - ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_7]], implicit $exec ; GCN-NEXT: [[V_CVT_F64_I32_e32_8:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY8]], implicit $mode, implicit $exec - ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_8]], implicit $exec ; GCN-NEXT: [[V_CVT_F64_I32_e32_9:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY9]], implicit $mode, implicit $exec - ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_9]], implicit $exec ; GCN-NEXT: [[V_CVT_F64_I32_e32_10:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY10]], implicit $mode, implicit $exec - ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_10]], implicit $exec ; GCN-NEXT: [[V_CVT_F64_I32_e32_11:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY11]], implicit $mode, implicit $exec - ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_11]], implicit $exec ; GCN-NEXT: [[V_CVT_F64_I32_e32_12:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY12]], implicit $mode, implicit $exec - ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_12]], implicit $exec + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000) + ; GCN-NEXT: liveins: $vcc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $vcc = S_AND_B64 $exec, $vcc, implicit-def $scc + ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_]], implicit $exec + ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_1]], implicit $exec + ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_2]], implicit $exec + ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_3]], implicit $exec + ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_4]], implicit $exec + ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_5]], implicit $exec + ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_6]], implicit $exec + ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_7]], implicit $exec + ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_8]], implicit $exec + ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_9]], implicit $exec + ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_10]], implicit $exec + ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_11]], implicit $exec + ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_12]], implicit $exec ; GCN-NEXT: [[V_CVT_F64_I32_e32_13:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY13]], implicit $mode, implicit $exec ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_13]], implicit $exec ; GCN-NEXT: [[V_CVT_F64_I32_e32_14:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY14]], implicit $mode, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll index b079637103fa4..eb2d95e4db2d5 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll @@ -6,20 +6,20 @@ define amdgpu_kernel void @struct_atomic_buffer_load_i32(<4 x i32> %addr, i32 %i ; CHECK-LABEL: struct_atomic_buffer_load_i32: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34 ; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; CHECK-NEXT: s_load_b32 s4, s[4:5], 0x34 ; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; CHECK-NEXT: s_mov_b32 s5, 0 +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v1, s6 ; CHECK-NEXT: .LBB0_1: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s4 -; CHECK-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 idxen glc +; CHECK-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen glc ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 -; CHECK-NEXT: s_or_b32 s5, vcc_lo, s5 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 +; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; CHECK-NEXT: s_cbranch_execnz .LBB0_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm @@ -67,20 +67,20 @@ define amdgpu_kernel void @struct_atomic_buffer_load_i32_off(<4 x i32> %addr, i3 ; CHECK-LABEL: struct_atomic_buffer_load_i32_off: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34 ; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; CHECK-NEXT: s_load_b32 s4, s[4:5], 0x34 ; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; CHECK-NEXT: s_mov_b32 s5, 0 +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v1, s6 ; CHECK-NEXT: .LBB2_1: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s4 -; CHECK-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 idxen glc +; CHECK-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen glc ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 -; CHECK-NEXT: s_or_b32 s5, vcc_lo, s5 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 +; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; CHECK-NEXT: s_cbranch_execnz .LBB2_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm @@ -99,20 +99,20 @@ define amdgpu_kernel void @struct_atomic_buffer_load_i32_soff(<4 x i32> %addr, i ; CHECK-LABEL: struct_atomic_buffer_load_i32_soff: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34 ; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; CHECK-NEXT: s_load_b32 s4, s[4:5], 0x34 ; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; CHECK-NEXT: s_mov_b32 s5, 0 +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v1, s6 ; CHECK-NEXT: .LBB3_1: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s4 -; CHECK-NEXT: buffer_load_b32 v1, v1, s[0:3], 4 idxen offset:4 glc +; CHECK-NEXT: buffer_load_b32 v2, v1, s[0:3], 4 idxen offset:4 glc ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 -; CHECK-NEXT: s_or_b32 s5, vcc_lo, s5 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 +; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; CHECK-NEXT: s_cbranch_execnz .LBB3_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm @@ -130,20 +130,20 @@ define amdgpu_kernel void @struct_atomic_buffer_load_i32_dlc(<4 x i32> %addr, i3 ; CHECK-LABEL: struct_atomic_buffer_load_i32_dlc: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34 ; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; CHECK-NEXT: s_load_b32 s4, s[4:5], 0x34 ; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; CHECK-NEXT: s_mov_b32 s5, 0 +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v1, s6 ; CHECK-NEXT: .LBB4_1: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s4 -; CHECK-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 idxen offset:4 dlc +; CHECK-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen offset:4 dlc ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 -; CHECK-NEXT: s_or_b32 s5, vcc_lo, s5 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 +; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; CHECK-NEXT: s_cbranch_execnz .LBB4_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm @@ -194,20 +194,20 @@ define amdgpu_kernel void @struct_atomic_buffer_load_i64(<4 x i32> %addr, i32 %i ; CHECK-LABEL: struct_atomic_buffer_load_i64: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34 ; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; CHECK-NEXT: s_load_b32 s4, s[4:5], 0x34 ; CHECK-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; CHECK-NEXT: s_mov_b32 s5, 0 +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v2, s6 ; CHECK-NEXT: .LBB6_1: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v2, s4 -; CHECK-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], 0 idxen offset:4 glc +; CHECK-NEXT: buffer_load_b64 v[3:4], v2, s[0:3], 0 idxen offset:4 glc ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[2:3], v[0:1] -; CHECK-NEXT: s_or_b32 s5, vcc_lo, s5 +; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[3:4], v[0:1] +; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; CHECK-NEXT: s_cbranch_execnz .LBB6_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm @@ -227,20 +227,20 @@ define amdgpu_kernel void @struct_atomic_buffer_load_v2i16(<4 x i32> %addr, i32 ; CHECK-LABEL: struct_atomic_buffer_load_v2i16: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34 ; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; CHECK-NEXT: s_load_b32 s4, s[4:5], 0x34 ; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; CHECK-NEXT: s_mov_b32 s5, 0 +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v1, s6 ; CHECK-NEXT: .LBB7_1: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s4 -; CHECK-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 idxen glc +; CHECK-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen glc ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 -; CHECK-NEXT: s_or_b32 s5, vcc_lo, s5 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 +; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; CHECK-NEXT: s_cbranch_execnz .LBB7_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm @@ -260,23 +260,23 @@ define amdgpu_kernel void @struct_atomic_buffer_load_v4i16(<4 x i32> %addr, i32 ; CHECK-LABEL: struct_atomic_buffer_load_v4i16: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34 ; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; CHECK-NEXT: s_load_b32 s4, s[4:5], 0x34 ; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; CHECK-NEXT: s_mov_b32 s5, 0 +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v1, s6 ; CHECK-NEXT: .LBB8_1: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s4 -; CHECK-NEXT: buffer_load_b64 v[1:2], v1, s[0:3], 0 idxen offset:4 glc +; CHECK-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; CHECK-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; CHECK-NEXT: v_lshl_or_b32 v1, v2, 16, v1 -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 -; CHECK-NEXT: s_or_b32 s5, vcc_lo, s5 +; CHECK-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 +; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; CHECK-NEXT: s_cbranch_execnz .LBB8_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm @@ -297,20 +297,20 @@ define amdgpu_kernel void @struct_atomic_buffer_load_v4i32(<4 x i32> %addr, i32 ; CHECK-LABEL: struct_atomic_buffer_load_v4i32: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34 ; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; CHECK-NEXT: s_load_b32 s4, s[4:5], 0x34 ; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; CHECK-NEXT: s_mov_b32 s5, 0 +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v1, s6 ; CHECK-NEXT: .LBB9_1: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s4 -; CHECK-NEXT: buffer_load_b128 v[1:4], v1, s[0:3], 0 idxen offset:4 glc +; CHECK-NEXT: buffer_load_b128 v[2:5], v1, s[0:3], 0 idxen offset:4 glc ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v4, v0 -; CHECK-NEXT: s_or_b32 s5, vcc_lo, s5 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v5, v0 +; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; CHECK-NEXT: s_cbranch_execnz .LBB9_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm @@ -330,22 +330,22 @@ define amdgpu_kernel void @struct_atomic_buffer_load_ptr(<4 x i32> %addr, i32 %i ; CHECK-LABEL: struct_atomic_buffer_load_ptr: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34 ; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; CHECK-NEXT: s_load_b32 s4, s[4:5], 0x34 ; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; CHECK-NEXT: s_mov_b32 s5, 0 +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v1, s6 ; CHECK-NEXT: .LBB10_1: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s4 -; CHECK-NEXT: buffer_load_b64 v[1:2], v1, s[0:3], 0 idxen offset:4 glc +; CHECK-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_load_b32 v1, v[1:2] +; CHECK-NEXT: flat_load_b32 v2, v[2:3] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 -; CHECK-NEXT: s_or_b32 s5, vcc_lo, s5 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 +; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; CHECK-NEXT: s_cbranch_execnz .LBB10_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll index e056f05aea451..bc50b12b59049 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll @@ -6,20 +6,20 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_i32(ptr addrspace(8) %p ; CHECK-LABEL: struct_ptr_atomic_buffer_load_i32: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34 ; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; CHECK-NEXT: s_load_b32 s4, s[4:5], 0x34 ; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; CHECK-NEXT: s_mov_b32 s5, 0 +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v1, s6 ; CHECK-NEXT: .LBB0_1: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s4 -; CHECK-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 idxen glc +; CHECK-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen glc ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 -; CHECK-NEXT: s_or_b32 s5, vcc_lo, s5 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 +; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; CHECK-NEXT: s_cbranch_execnz .LBB0_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm @@ -67,20 +67,20 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_i32_off(ptr addrspace(8 ; CHECK-LABEL: struct_ptr_atomic_buffer_load_i32_off: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34 ; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; CHECK-NEXT: s_load_b32 s4, s[4:5], 0x34 ; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; CHECK-NEXT: s_mov_b32 s5, 0 +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v1, s6 ; CHECK-NEXT: .LBB2_1: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s4 -; CHECK-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 idxen glc +; CHECK-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen glc ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 -; CHECK-NEXT: s_or_b32 s5, vcc_lo, s5 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 +; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; CHECK-NEXT: s_cbranch_execnz .LBB2_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm @@ -99,20 +99,20 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_i32_soff(ptr addrspace( ; CHECK-LABEL: struct_ptr_atomic_buffer_load_i32_soff: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34 ; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; CHECK-NEXT: s_load_b32 s4, s[4:5], 0x34 ; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; CHECK-NEXT: s_mov_b32 s5, 0 +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v1, s6 ; CHECK-NEXT: .LBB3_1: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s4 -; CHECK-NEXT: buffer_load_b32 v1, v1, s[0:3], 4 idxen offset:4 glc +; CHECK-NEXT: buffer_load_b32 v2, v1, s[0:3], 4 idxen offset:4 glc ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 -; CHECK-NEXT: s_or_b32 s5, vcc_lo, s5 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 +; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; CHECK-NEXT: s_cbranch_execnz .LBB3_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm @@ -130,20 +130,20 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_i32_dlc(ptr addrspace(8 ; CHECK-LABEL: struct_ptr_atomic_buffer_load_i32_dlc: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34 ; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; CHECK-NEXT: s_load_b32 s4, s[4:5], 0x34 ; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; CHECK-NEXT: s_mov_b32 s5, 0 +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v1, s6 ; CHECK-NEXT: .LBB4_1: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s4 -; CHECK-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 idxen offset:4 dlc +; CHECK-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen offset:4 dlc ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 -; CHECK-NEXT: s_or_b32 s5, vcc_lo, s5 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 +; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; CHECK-NEXT: s_cbranch_execnz .LBB4_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm @@ -194,20 +194,20 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_i64(ptr addrspace(8) %p ; CHECK-LABEL: struct_ptr_atomic_buffer_load_i64: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34 ; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; CHECK-NEXT: s_load_b32 s4, s[4:5], 0x34 ; CHECK-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; CHECK-NEXT: s_mov_b32 s5, 0 +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v2, s6 ; CHECK-NEXT: .LBB6_1: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v2, s4 -; CHECK-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], 0 idxen offset:4 glc +; CHECK-NEXT: buffer_load_b64 v[3:4], v2, s[0:3], 0 idxen offset:4 glc ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[2:3], v[0:1] -; CHECK-NEXT: s_or_b32 s5, vcc_lo, s5 +; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[3:4], v[0:1] +; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; CHECK-NEXT: s_cbranch_execnz .LBB6_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm @@ -227,20 +227,20 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_v2i16(ptr addrspace(8) ; CHECK-LABEL: struct_ptr_atomic_buffer_load_v2i16: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34 ; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; CHECK-NEXT: s_load_b32 s4, s[4:5], 0x34 ; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; CHECK-NEXT: s_mov_b32 s5, 0 +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v1, s6 ; CHECK-NEXT: .LBB7_1: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s4 -; CHECK-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 idxen glc +; CHECK-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen glc ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 -; CHECK-NEXT: s_or_b32 s5, vcc_lo, s5 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 +; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; CHECK-NEXT: s_cbranch_execnz .LBB7_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm @@ -260,23 +260,23 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_v4i16(ptr addrspace(8) ; CHECK-LABEL: struct_ptr_atomic_buffer_load_v4i16: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34 ; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; CHECK-NEXT: s_load_b32 s4, s[4:5], 0x34 ; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; CHECK-NEXT: s_mov_b32 s5, 0 +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v1, s6 ; CHECK-NEXT: .LBB8_1: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s4 -; CHECK-NEXT: buffer_load_b64 v[1:2], v1, s[0:3], 0 idxen offset:4 glc +; CHECK-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; CHECK-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; CHECK-NEXT: v_lshl_or_b32 v1, v2, 16, v1 -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 -; CHECK-NEXT: s_or_b32 s5, vcc_lo, s5 +; CHECK-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 +; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; CHECK-NEXT: s_cbranch_execnz .LBB8_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm @@ -297,20 +297,20 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_v4i32(ptr addrspace(8) ; CHECK-LABEL: struct_ptr_atomic_buffer_load_v4i32: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34 ; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; CHECK-NEXT: s_load_b32 s4, s[4:5], 0x34 ; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; CHECK-NEXT: s_mov_b32 s5, 0 +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v1, s6 ; CHECK-NEXT: .LBB9_1: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s4 -; CHECK-NEXT: buffer_load_b128 v[1:4], v1, s[0:3], 0 idxen offset:4 glc +; CHECK-NEXT: buffer_load_b128 v[2:5], v1, s[0:3], 0 idxen offset:4 glc ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v4, v0 -; CHECK-NEXT: s_or_b32 s5, vcc_lo, s5 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v5, v0 +; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; CHECK-NEXT: s_cbranch_execnz .LBB9_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm @@ -330,22 +330,22 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_ptr(ptr addrspace(8) %p ; CHECK-LABEL: struct_ptr_atomic_buffer_load_ptr: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34 ; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; CHECK-NEXT: s_load_b32 s4, s[4:5], 0x34 ; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; CHECK-NEXT: s_mov_b32 s5, 0 +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v1, s6 ; CHECK-NEXT: .LBB10_1: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s4 -; CHECK-NEXT: buffer_load_b64 v[1:2], v1, s[0:3], 0 idxen offset:4 glc +; CHECK-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_load_b32 v1, v[1:2] +; CHECK-NEXT: flat_load_b32 v2, v[2:3] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 -; CHECK-NEXT: s_or_b32 s5, vcc_lo, s5 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 +; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; CHECK-NEXT: s_cbranch_execnz .LBB10_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll index f961b7d9d5223..23b57a7efa586 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll @@ -5818,38 +5818,39 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: ds_load_b32 v2, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_lshlrev_b32 v2, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: v_dual_add_f32 v4, v6, v4 :: v_dual_lshlrev_b32 v5, 16, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v2, v5, v2 -; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX11-NEXT: v_mov_b32_e32 v4, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX11-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 +; GFX11-NEXT: v_add_f32_e32 v2, v2, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v5, v7, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v4, v2, 0x7060302 +; GFX11-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 +; GFX11-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 +; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -5864,33 +5865,33 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v2, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v3, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_add_f32_e32 v2, v5, v2 -; GFX10-NEXT: v_add_f32_e32 v4, v6, v4 -; GFX10-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX10-NEXT: v_add3_u32 v5, v5, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX10-NEXT: v_mov_b32_e32 v4, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX10-NEXT: v_add_f32_e32 v2, v2, v3 +; GFX10-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX10-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v7, s4 -; GFX10-NEXT: v_perm_b32 v2, v4, v2, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 +; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB24_1 @@ -5904,29 +5905,29 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ds_read_b32 v2, v0 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX90A-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX90A-NEXT: v_add_f32_e32 v2, v6, v2 -; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s8 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; GFX90A-NEXT: v_perm_b32 v2, v3, v2, s9 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX90A-NEXT: v_add_f32_e32 v2, v2, v3 +; GFX90A-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX90A-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v2, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v2, v5, v2, s9 ; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -5943,29 +5944,29 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: ds_read_b32 v2, v0 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX908-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX908-NEXT: v_add_f32_e32 v2, v6, v2 -; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_add3_u32 v5, v5, v3, s8 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; GFX908-NEXT: v_perm_b32 v2, v3, v2, s9 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX908-NEXT: v_add_f32_e32 v2, v2, v3 +; GFX908-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX908-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v2, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v2, v5, v2, s9 ; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -5983,30 +5984,30 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v2, v0 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX8-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_add_f32_e32 v2, v6, v2 -; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX8-NEXT: v_add_f32_e32 v2, v2, v3 +; GFX8-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v2, v5, v2, 16 ; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -6124,38 +6125,39 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: ds_load_b32 v2, v0 offset:65532 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_lshlrev_b32 v2, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: v_dual_add_f32 v4, v6, v4 :: v_dual_lshlrev_b32 v5, 16, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v2, v5, v2 -; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX11-NEXT: v_mov_b32_e32 v4, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX11-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 +; GFX11-NEXT: v_add_f32_e32 v2, v2, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v5, v7, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v4, v2, 0x7060302 +; GFX11-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 +; GFX11-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532 +; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -6170,33 +6172,33 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v2, v0 offset:65532 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v3, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_add_f32_e32 v2, v5, v2 -; GFX10-NEXT: v_add_f32_e32 v4, v6, v4 -; GFX10-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX10-NEXT: v_add3_u32 v5, v5, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX10-NEXT: v_mov_b32_e32 v4, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX10-NEXT: v_add_f32_e32 v2, v2, v3 +; GFX10-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX10-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v7, s4 -; GFX10-NEXT: v_perm_b32 v2, v4, v2, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 +; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB25_1 @@ -6210,29 +6212,29 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX90A-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX90A-NEXT: v_add_f32_e32 v2, v6, v2 -; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s8 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; GFX90A-NEXT: v_perm_b32 v2, v3, v2, s9 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX90A-NEXT: v_add_f32_e32 v2, v2, v3 +; GFX90A-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX90A-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v2, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v2, v5, v2, s9 ; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -6249,29 +6251,29 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX908-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX908-NEXT: v_add_f32_e32 v2, v6, v2 -; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_add3_u32 v5, v5, v3, s8 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; GFX908-NEXT: v_perm_b32 v2, v3, v2, s9 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX908-NEXT: v_add_f32_e32 v2, v2, v3 +; GFX908-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX908-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v2, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v2, v5, v2, s9 ; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -6289,30 +6291,30 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX8-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_add_f32_e32 v2, v6, v2 -; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX8-NEXT: v_add_f32_e32 v2, v2, v3 +; GFX8-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v2, v5, v2, 16 ; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -6430,39 +6432,39 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX11-LABEL: local_atomic_fadd_noret_v2bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: ds_load_b32 v2, v0 +; GFX11-NEXT: ds_load_b32 v3, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_add_f32 v3, v4, v3 :: v_dual_and_b32 v6, 0xffff0000, v2 -; GFX11-NEXT: v_add_f32_e32 v4, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_add_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 +; GFX11-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v5, v7, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v4, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v4, v3, 0x7060302 +; GFX11-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 +; GFX11-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -6475,34 +6477,34 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX10-LABEL: local_atomic_fadd_noret_v2bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: ds_read_b32 v2, v0 +; GFX10-NEXT: ds_read_b32 v3, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_add_f32_e32 v3, v4, v3 -; GFX10-NEXT: v_add_f32_e32 v4, v6, v5 -; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX10-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX10-NEXT: v_add_f32_e32 v5, v5, v1 ; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v3, v5, v7, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v4, v3, 0x7060302 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v4, v4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 +; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB26_1 @@ -6513,35 +6515,35 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX90A-LABEL: local_atomic_fadd_noret_v2bf16: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v2, v0 +; GFX90A-NEXT: ds_read_b32 v3, v0 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX90A-NEXT: v_add_f32_e32 v3, v4, v3 -; GFX90A-NEXT: v_add_f32_e32 v4, v6, v5 -; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s8 -; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc -; GFX90A-NEXT: v_perm_b32 v3, v4, v3, s9 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX90A-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX90A-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v5, v4, s9 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB26_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6551,35 +6553,35 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX908-LABEL: local_atomic_fadd_noret_v2bf16: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: ds_read_b32 v2, v0 +; GFX908-NEXT: ds_read_b32 v3, v0 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX908-NEXT: v_add_f32_e32 v3, v4, v3 -; GFX908-NEXT: v_add_f32_e32 v4, v6, v5 -; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v7, v4, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX908-NEXT: v_add3_u32 v5, v5, v3, s8 -; GFX908-NEXT: v_add3_u32 v7, v7, v4, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc -; GFX908-NEXT: v_perm_b32 v3, v4, v3, s9 -; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX908-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX908-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX908-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v4, v5, v4, s9 +; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v2, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB26_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6590,36 +6592,36 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_read_b32 v2, v0 +; GFX8-NEXT: ds_read_b32 v3, v0 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX8-NEXT: v_add_f32_e32 v3, v4, v3 -; GFX8-NEXT: v_add_f32_e32 v4, v6, v5 -; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v7, v4, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v4 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX8-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX8-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX8-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v2, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB26_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6727,39 +6729,39 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX11-LABEL: local_atomic_fadd_noret_v2bf16__ofset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: ds_load_b32 v2, v0 offset:65532 +; GFX11-NEXT: ds_load_b32 v3, v0 offset:65532 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_add_f32 v3, v4, v3 :: v_dual_and_b32 v6, 0xffff0000, v2 -; GFX11-NEXT: v_add_f32_e32 v4, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_add_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 +; GFX11-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v5, v7, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v4, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v4, v3, 0x7060302 +; GFX11-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532 +; GFX11-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -6772,34 +6774,34 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX10-LABEL: local_atomic_fadd_noret_v2bf16__ofset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: ds_read_b32 v2, v0 offset:65532 +; GFX10-NEXT: ds_read_b32 v3, v0 offset:65532 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_add_f32_e32 v3, v4, v3 -; GFX10-NEXT: v_add_f32_e32 v4, v6, v5 -; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX10-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX10-NEXT: v_add_f32_e32 v5, v5, v1 ; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v3, v5, v7, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v4, v3, 0x7060302 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v4, v4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 +; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB27_1 @@ -6810,35 +6812,35 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX90A-LABEL: local_atomic_fadd_noret_v2bf16__ofset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532 +; GFX90A-NEXT: ds_read_b32 v3, v0 offset:65532 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX90A-NEXT: v_add_f32_e32 v3, v4, v3 -; GFX90A-NEXT: v_add_f32_e32 v4, v6, v5 -; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s8 -; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc -; GFX90A-NEXT: v_perm_b32 v3, v4, v3, s9 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX90A-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX90A-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v5, v4, s9 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB27_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6848,35 +6850,35 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX908-LABEL: local_atomic_fadd_noret_v2bf16__ofset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: ds_read_b32 v2, v0 offset:65532 +; GFX908-NEXT: ds_read_b32 v3, v0 offset:65532 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX908-NEXT: v_add_f32_e32 v3, v4, v3 -; GFX908-NEXT: v_add_f32_e32 v4, v6, v5 -; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v7, v4, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX908-NEXT: v_add3_u32 v5, v5, v3, s8 -; GFX908-NEXT: v_add3_u32 v7, v7, v4, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc -; GFX908-NEXT: v_perm_b32 v3, v4, v3, s9 -; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX908-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX908-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX908-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v4, v5, v4, s9 +; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v2, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB27_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6887,36 +6889,36 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_read_b32 v2, v0 offset:65532 +; GFX8-NEXT: ds_read_b32 v3, v0 offset:65532 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX8-NEXT: v_add_f32_e32 v3, v4, v3 -; GFX8-NEXT: v_add_f32_e32 v4, v6, v5 -; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v7, v4, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v4 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX8-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX8-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX8-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v2, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB27_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7628,22 +7630,21 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX7-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX7-NEXT: s_cbranch_execz .LBB28_4 ; GFX7-NEXT: ; %bb.1: -; GFX7-NEXT: s_lshl_b32 s10, s3, 3 -; GFX7-NEXT: v_mov_b32_e32 v1, s10 -; GFX7-NEXT: ds_read_b32 v1, v1 +; GFX7-NEXT: s_lshl_b32 s8, s3, 3 +; GFX7-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-NEXT: ds_read_b32 v1, v2 ; GFX7-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v2, s0 -; GFX7-NEXT: v_mul_f32_e32 v2, 0x42280000, v2 +; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v3, s0 +; GFX7-NEXT: v_mul_f32_e32 v3, 0x42280000, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB28_2: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: v_mov_b32_e32 v1, s10 -; GFX7-NEXT: v_add_f32_e32 v4, v3, v2 -; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v1, v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v1 +; GFX7-NEXT: v_add_f32_e32 v1, v4, v3 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v2, v4, v1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v3 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v4 ; GFX7-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB28_2 @@ -7659,23 +7660,22 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX7-NEXT: s_and_saveexec_b64 s[6:7], s[0:1] ; GFX7-NEXT: s_cbranch_execz .LBB28_7 ; GFX7-NEXT: ; %bb.5: -; GFX7-NEXT: s_lshl_b32 s3, s3, 4 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: ds_read_b32 v2, v1 +; GFX7-NEXT: s_lshl_b32 s0, s3, 4 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: ds_read_b32 v3, v1 ; GFX7-NEXT: s_bcnt1_i32_b64 s0, s[8:9] -; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 -; GFX7-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 +; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v2, s0 +; GFX7-NEXT: v_mul_f32_e32 v2, 0x42280000, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB28_6: ; %atomicrmw.start2 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_add_f32_e32 v3, v2, v1 -; GFX7-NEXT: v_mov_b32_e32 v4, s3 -; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v4, v2, v3 +; GFX7-NEXT: v_add_f32_e32 v4, v3, v2 +; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v3, v2 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v4, v3 ; GFX7-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v2, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB28_6 ; GFX7-NEXT: .LBB28_7: ; %Flow22 @@ -7710,24 +7710,23 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX7-NEXT: s_xor_b64 s[6:7], exec, s[0:1] ; GFX7-NEXT: s_cbranch_execz .LBB28_13 ; GFX7-NEXT: ; %bb.10: -; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b32 v2, v2 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: ds_read_b32 v2, v3 +; GFX7-NEXT: s_mov_b64 s[2:3], 0 ; GFX7-NEXT: .LBB28_11: ; %atomicrmw.start8 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v3, v2 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_add_f32_e32 v4, v3, v1 -; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v2, v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v2 +; GFX7-NEXT: v_add_f32_e32 v2, v4, v1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v3, v4, v2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v3 -; GFX7-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v4 +; GFX7-NEXT: s_or_b64 s[2:3], s[0:1], s[2:3] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX7-NEXT: s_cbranch_execnz .LBB28_11 ; GFX7-NEXT: ; %bb.12: ; %Flow -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX7-NEXT: .LBB28_13: ; %Flow20 ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -7755,22 +7754,21 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX6-NEXT: s_cbranch_execz .LBB28_4 ; GFX6-NEXT: ; %bb.1: -; GFX6-NEXT: s_lshl_b32 s10, s3, 3 -; GFX6-NEXT: v_mov_b32_e32 v1, s10 -; GFX6-NEXT: ds_read_b32 v1, v1 +; GFX6-NEXT: s_lshl_b32 s8, s3, 3 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: ds_read_b32 v1, v2 ; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s0 -; GFX6-NEXT: v_mul_f32_e32 v2, 0x42280000, v2 +; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v3, s0 +; GFX6-NEXT: v_mul_f32_e32 v3, 0x42280000, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB28_2: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v3, v1 -; GFX6-NEXT: v_mov_b32_e32 v1, s10 -; GFX6-NEXT: v_add_f32_e32 v4, v3, v2 -; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v1, v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v1 +; GFX6-NEXT: v_add_f32_e32 v1, v4, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v2, v4, v1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v3 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v4 ; GFX6-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB28_2 @@ -7786,23 +7784,22 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX6-NEXT: s_and_saveexec_b64 s[6:7], s[0:1] ; GFX6-NEXT: s_cbranch_execz .LBB28_7 ; GFX6-NEXT: ; %bb.5: -; GFX6-NEXT: s_lshl_b32 s3, s3, 4 -; GFX6-NEXT: v_mov_b32_e32 v1, s3 -; GFX6-NEXT: ds_read_b32 v2, v1 +; GFX6-NEXT: s_lshl_b32 s0, s3, 4 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_read_b32 v3, v1 ; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[8:9] -; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 -; GFX6-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 +; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s0 +; GFX6-NEXT: v_mul_f32_e32 v2, 0x42280000, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB28_6: ; %atomicrmw.start2 ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_add_f32_e32 v3, v2, v1 -; GFX6-NEXT: v_mov_b32_e32 v4, s3 -; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v4, v2, v3 +; GFX6-NEXT: v_add_f32_e32 v4, v3, v2 +; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v3, v2 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v4, v3 ; GFX6-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v2, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB28_6 ; GFX6-NEXT: .LBB28_7: ; %Flow20 @@ -7837,24 +7834,23 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX6-NEXT: s_xor_b64 s[6:7], exec, s[0:1] ; GFX6-NEXT: s_cbranch_execz .LBB28_13 ; GFX6-NEXT: ; %bb.10: -; GFX6-NEXT: v_mov_b32_e32 v2, s2 +; GFX6-NEXT: v_mov_b32_e32 v3, s2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v2, v2 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: ds_read_b32 v2, v3 +; GFX6-NEXT: s_mov_b64 s[2:3], 0 ; GFX6-NEXT: .LBB28_11: ; %atomicrmw.start8 ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v3, v2 -; GFX6-NEXT: v_mov_b32_e32 v2, s2 -; GFX6-NEXT: v_add_f32_e32 v4, v3, v1 -; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v2, v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: v_add_f32_e32 v2, v4, v1 +; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v3, v4, v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v3 -; GFX6-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v4 +; GFX6-NEXT: s_or_b64 s[2:3], s[0:1], s[2:3] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX6-NEXT: s_cbranch_execnz .LBB28_11 ; GFX6-NEXT: ; %bb.12: ; %Flow -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX6-NEXT: .LBB28_13: ; %Flow18 ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -8471,22 +8467,21 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX7-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX7-NEXT: s_cbranch_execz .LBB29_4 ; GFX7-NEXT: ; %bb.1: -; GFX7-NEXT: s_lshl_b32 s10, s3, 3 -; GFX7-NEXT: v_mov_b32_e32 v1, s10 -; GFX7-NEXT: ds_read_b32 v1, v1 +; GFX7-NEXT: s_lshl_b32 s8, s3, 3 +; GFX7-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-NEXT: ds_read_b32 v1, v2 ; GFX7-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v2, s0 -; GFX7-NEXT: v_mul_f32_e32 v2, 0x42280000, v2 +; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v3, s0 +; GFX7-NEXT: v_mul_f32_e32 v3, 0x42280000, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB29_2: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: v_mov_b32_e32 v1, s10 -; GFX7-NEXT: v_add_f32_e32 v4, v3, v2 -; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v1, v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v1 +; GFX7-NEXT: v_add_f32_e32 v1, v4, v3 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v2, v4, v1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v3 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v4 ; GFX7-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB29_2 @@ -8502,23 +8497,22 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX7-NEXT: s_and_saveexec_b64 s[6:7], s[0:1] ; GFX7-NEXT: s_cbranch_execz .LBB29_7 ; GFX7-NEXT: ; %bb.5: -; GFX7-NEXT: s_lshl_b32 s3, s3, 4 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: ds_read_b32 v2, v1 +; GFX7-NEXT: s_lshl_b32 s0, s3, 4 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: ds_read_b32 v3, v1 ; GFX7-NEXT: s_bcnt1_i32_b64 s0, s[8:9] -; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 -; GFX7-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 +; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v2, s0 +; GFX7-NEXT: v_mul_f32_e32 v2, 0x42280000, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB29_6: ; %atomicrmw.start2 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_add_f32_e32 v3, v2, v1 -; GFX7-NEXT: v_mov_b32_e32 v4, s3 -; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v4, v2, v3 +; GFX7-NEXT: v_add_f32_e32 v4, v3, v2 +; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v3, v2 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v4, v3 ; GFX7-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v2, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB29_6 ; GFX7-NEXT: .LBB29_7: ; %Flow22 @@ -8553,24 +8547,23 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX7-NEXT: s_xor_b64 s[6:7], exec, s[0:1] ; GFX7-NEXT: s_cbranch_execz .LBB29_13 ; GFX7-NEXT: ; %bb.10: -; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b32 v2, v2 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: ds_read_b32 v2, v3 +; GFX7-NEXT: s_mov_b64 s[2:3], 0 ; GFX7-NEXT: .LBB29_11: ; %atomicrmw.start8 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v3, v2 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_add_f32_e32 v4, v3, v1 -; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v2, v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v2 +; GFX7-NEXT: v_add_f32_e32 v2, v4, v1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v3, v4, v2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v3 -; GFX7-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v4 +; GFX7-NEXT: s_or_b64 s[2:3], s[0:1], s[2:3] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX7-NEXT: s_cbranch_execnz .LBB29_11 ; GFX7-NEXT: ; %bb.12: ; %Flow -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX7-NEXT: .LBB29_13: ; %Flow20 ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -8598,22 +8591,21 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX6-NEXT: s_cbranch_execz .LBB29_4 ; GFX6-NEXT: ; %bb.1: -; GFX6-NEXT: s_lshl_b32 s10, s3, 3 -; GFX6-NEXT: v_mov_b32_e32 v1, s10 -; GFX6-NEXT: ds_read_b32 v1, v1 +; GFX6-NEXT: s_lshl_b32 s8, s3, 3 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: ds_read_b32 v1, v2 ; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s0 -; GFX6-NEXT: v_mul_f32_e32 v2, 0x42280000, v2 +; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v3, s0 +; GFX6-NEXT: v_mul_f32_e32 v3, 0x42280000, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB29_2: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v3, v1 -; GFX6-NEXT: v_mov_b32_e32 v1, s10 -; GFX6-NEXT: v_add_f32_e32 v4, v3, v2 -; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v1, v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v1 +; GFX6-NEXT: v_add_f32_e32 v1, v4, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v2, v4, v1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v3 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v4 ; GFX6-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB29_2 @@ -8629,23 +8621,22 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX6-NEXT: s_and_saveexec_b64 s[6:7], s[0:1] ; GFX6-NEXT: s_cbranch_execz .LBB29_7 ; GFX6-NEXT: ; %bb.5: -; GFX6-NEXT: s_lshl_b32 s3, s3, 4 -; GFX6-NEXT: v_mov_b32_e32 v1, s3 -; GFX6-NEXT: ds_read_b32 v2, v1 +; GFX6-NEXT: s_lshl_b32 s0, s3, 4 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_read_b32 v3, v1 ; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[8:9] -; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 -; GFX6-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 +; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s0 +; GFX6-NEXT: v_mul_f32_e32 v2, 0x42280000, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB29_6: ; %atomicrmw.start2 ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_add_f32_e32 v3, v2, v1 -; GFX6-NEXT: v_mov_b32_e32 v4, s3 -; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v4, v2, v3 +; GFX6-NEXT: v_add_f32_e32 v4, v3, v2 +; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v3, v2 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v4, v3 ; GFX6-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v2, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB29_6 ; GFX6-NEXT: .LBB29_7: ; %Flow20 @@ -8680,24 +8671,23 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX6-NEXT: s_xor_b64 s[6:7], exec, s[0:1] ; GFX6-NEXT: s_cbranch_execz .LBB29_13 ; GFX6-NEXT: ; %bb.10: -; GFX6-NEXT: v_mov_b32_e32 v2, s2 +; GFX6-NEXT: v_mov_b32_e32 v3, s2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v2, v2 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: ds_read_b32 v2, v3 +; GFX6-NEXT: s_mov_b64 s[2:3], 0 ; GFX6-NEXT: .LBB29_11: ; %atomicrmw.start8 ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v3, v2 -; GFX6-NEXT: v_mov_b32_e32 v2, s2 -; GFX6-NEXT: v_add_f32_e32 v4, v3, v1 -; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v2, v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: v_add_f32_e32 v2, v4, v1 +; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v3, v4, v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v3 -; GFX6-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v4 +; GFX6-NEXT: s_or_b64 s[2:3], s[0:1], s[2:3] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX6-NEXT: s_cbranch_execnz .LBB29_11 ; GFX6-NEXT: ; %bb.12: ; %Flow -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX6-NEXT: .LBB29_13: ; %Flow18 ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll index 47058de71e7f4..d419b0cdfdd1a 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll @@ -4520,15 +4520,15 @@ define <2 x half> @local_atomic_fmax_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ds_load_b32 v2, v0 +; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: v_pk_max_num_f16 v2, v1, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v4, v3, v3 -; GFX12-NEXT: v_pk_max_num_f16 v2, v4, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 +; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -4550,17 +4550,17 @@ define <2 x half> @local_atomic_fmax_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ds_read_b32 v2, v0 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX940-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v4, v2 -; GFX940-NEXT: v_pk_max_f16 v3, v1, v1 -; GFX940-NEXT: v_pk_max_f16 v2, v4, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_max_f16 v2, v2, v3 -; GFX940-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 +; GFX940-NEXT: v_pk_max_f16 v2, v2, v1 +; GFX940-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB20_1 @@ -4573,15 +4573,15 @@ define <2 x half> @local_atomic_fmax_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: ds_load_b32 v2, v0 +; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: v_pk_max_f16 v2, v1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v4, v3, v3 -; GFX11-NEXT: v_pk_max_f16 v2, v4, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX11-NEXT: v_pk_max_f16 v2, v2, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -4600,14 +4600,14 @@ define <2 x half> @local_atomic_fmax_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v2, v0 +; GFX10-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v2 -; GFX10-NEXT: v_pk_max_f16 v2, v1, v1 -; GFX10-NEXT: v_pk_max_f16 v4, v3, v3 -; GFX10-NEXT: v_pk_max_f16 v2, v4, v2 +; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX10-NEXT: v_pk_max_f16 v2, v2, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -4626,16 +4626,16 @@ define <2 x half> @local_atomic_fmax_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ds_read_b32 v2, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX90A-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_pk_max_f16 v3, v1, v1 -; GFX90A-NEXT: v_pk_max_f16 v2, v4, v4 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v3 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX90A-NEXT: v_pk_max_f16 v2, v2, v1 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB20_1 @@ -4649,16 +4649,16 @@ define <2 x half> @local_atomic_fmax_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: ds_read_b32 v2, v0 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_pk_max_f16 v3, v1, v1 -; GFX908-NEXT: v_pk_max_f16 v2, v4, v4 -; GFX908-NEXT: v_pk_max_f16 v2, v2, v3 -; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX908-NEXT: v_pk_max_f16 v2, v2, v1 +; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB20_1 @@ -4673,17 +4673,17 @@ define <2 x half> @local_atomic_fmax_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v2, v0 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_sdwa v3, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 ; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_max_f16_sdwa v3, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v2, v1, v1 -; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v4, v4 -; GFX8-NEXT: v_max_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v2, v6, v2 -; GFX8-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX8-NEXT: v_max_f16_sdwa v2, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v4, v4 +; GFX8-NEXT: v_max_f16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v5, v5, v1 +; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 ; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -4792,15 +4792,15 @@ define <2 x half> @local_atomic_fmax_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ds_load_b32 v2, v0 offset:65532 +; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: v_pk_max_num_f16 v2, v1, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v4, v3, v3 -; GFX12-NEXT: v_pk_max_num_f16 v2, v4, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 +; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -4822,17 +4822,17 @@ define <2 x half> @local_atomic_fmax_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX940-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v4, v2 -; GFX940-NEXT: v_pk_max_f16 v3, v1, v1 -; GFX940-NEXT: v_pk_max_f16 v2, v4, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_max_f16 v2, v2, v3 -; GFX940-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 +; GFX940-NEXT: v_pk_max_f16 v2, v2, v1 +; GFX940-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB21_1 @@ -4845,15 +4845,15 @@ define <2 x half> @local_atomic_fmax_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: ds_load_b32 v2, v0 offset:65532 +; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: v_pk_max_f16 v2, v1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v4, v3, v3 -; GFX11-NEXT: v_pk_max_f16 v2, v4, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX11-NEXT: v_pk_max_f16 v2, v2, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -4872,14 +4872,14 @@ define <2 x half> @local_atomic_fmax_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v2, v0 offset:65532 +; GFX10-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v2 -; GFX10-NEXT: v_pk_max_f16 v2, v1, v1 -; GFX10-NEXT: v_pk_max_f16 v4, v3, v3 -; GFX10-NEXT: v_pk_max_f16 v2, v4, v2 +; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX10-NEXT: v_pk_max_f16 v2, v2, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -4898,16 +4898,16 @@ define <2 x half> @local_atomic_fmax_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX90A-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_pk_max_f16 v3, v1, v1 -; GFX90A-NEXT: v_pk_max_f16 v2, v4, v4 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v3 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX90A-NEXT: v_pk_max_f16 v2, v2, v1 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB21_1 @@ -4921,16 +4921,16 @@ define <2 x half> @local_atomic_fmax_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX908-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_pk_max_f16 v3, v1, v1 -; GFX908-NEXT: v_pk_max_f16 v2, v4, v4 -; GFX908-NEXT: v_pk_max_f16 v2, v2, v3 -; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX908-NEXT: v_pk_max_f16 v2, v2, v1 +; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB21_1 @@ -4945,17 +4945,17 @@ define <2 x half> @local_atomic_fmax_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_sdwa v3, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 ; GFX8-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_max_f16_sdwa v3, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v2, v1, v1 -; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v4, v4 -; GFX8-NEXT: v_max_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v2, v6, v2 -; GFX8-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX8-NEXT: v_max_f16_sdwa v2, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v4, v4 +; GFX8-NEXT: v_max_f16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v5, v5, v1 +; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 ; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -5064,14 +5064,14 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ds_load_b32 v2, v0 +; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_pk_max_num_f16 v3, v1, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 +; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v3 +; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -5093,13 +5093,13 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ds_read_b32 v2, v0 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX940-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_pk_max_f16 v3, v1, v1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX940-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_max_f16 v3, v4, v3 +; GFX940-NEXT: v_pk_max_f16 v3, v3, v1 ; GFX940-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 @@ -5115,14 +5115,14 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: ds_load_b32 v2, v0 +; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_pk_max_f16 v3, v1, v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v3, v4, v3 +; GFX11-NEXT: v_pk_max_f16 v3, v3, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -5141,13 +5141,13 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v2, v0 +; GFX10-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_pk_max_f16 v3, v1, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX10-NEXT: v_pk_max_f16 v3, v4, v3 +; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX10-NEXT: v_pk_max_f16 v3, v3, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -5166,12 +5166,12 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ds_read_b32 v2, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_pk_max_f16 v3, v1, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX90A-NEXT: v_pk_max_f16 v3, v4, v3 +; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX90A-NEXT: v_pk_max_f16 v3, v3, v1 ; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 @@ -5188,12 +5188,12 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: ds_read_b32 v2, v0 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_pk_max_f16 v3, v1, v1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX908-NEXT: v_pk_max_f16 v3, v4, v3 +; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX908-NEXT: v_pk_max_f16 v3, v3, v1 ; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 @@ -5209,23 +5209,23 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_read_b32 v2, v0 +; GFX8-NEXT: ds_read_b32 v3, v0 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_sdwa v2, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 ; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_max_f16_sdwa v3, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v1, v1 -; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX8-NEXT: v_max_f16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v4, v6, v5 -; GFX8-NEXT: v_or_b32_e32 v3, v4, v3 -; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 +; GFX8-NEXT: v_max_f16_sdwa v4, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v3, v3 +; GFX8-NEXT: v_max_f16_sdwa v4, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v5, v5, v1 +; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v2, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB22_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5326,14 +5326,14 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ds_load_b32 v2, v0 offset:65532 +; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_pk_max_num_f16 v3, v1, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 +; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v3 +; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -5355,13 +5355,13 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX940-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_pk_max_f16 v3, v1, v1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX940-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_max_f16 v3, v4, v3 +; GFX940-NEXT: v_pk_max_f16 v3, v3, v1 ; GFX940-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 @@ -5377,14 +5377,14 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: ds_load_b32 v2, v0 offset:65532 +; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_pk_max_f16 v3, v1, v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v3, v4, v3 +; GFX11-NEXT: v_pk_max_f16 v3, v3, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -5403,13 +5403,13 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v2, v0 offset:65532 +; GFX10-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_pk_max_f16 v3, v1, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX10-NEXT: v_pk_max_f16 v3, v4, v3 +; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX10-NEXT: v_pk_max_f16 v3, v3, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -5428,12 +5428,12 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX90A-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_pk_max_f16 v3, v1, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX90A-NEXT: v_pk_max_f16 v3, v4, v3 +; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX90A-NEXT: v_pk_max_f16 v3, v3, v1 ; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 @@ -5450,12 +5450,12 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_pk_max_f16 v3, v1, v1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX908-NEXT: v_pk_max_f16 v3, v4, v3 +; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX908-NEXT: v_pk_max_f16 v3, v3, v1 ; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 @@ -5471,23 +5471,23 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_read_b32 v2, v0 offset:65532 +; GFX8-NEXT: ds_read_b32 v3, v0 offset:65532 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_sdwa v2, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 ; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_max_f16_sdwa v3, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v1, v1 -; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX8-NEXT: v_max_f16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v4, v6, v5 -; GFX8-NEXT: v_or_b32_e32 v3, v4, v3 -; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 +; GFX8-NEXT: v_max_f16_sdwa v4, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v3, v3 +; GFX8-NEXT: v_max_f16_sdwa v4, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v5, v5, v1 +; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v2, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB23_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5594,36 +5594,37 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ds_load_b32 v2, v0 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_lshlrev_b32 v2, 16, v1 -; GFX12-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-NEXT: v_dual_max_num_f32 v4, v6, v4 :: v_dual_lshlrev_b32 v5, 16, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_max_num_f32_e32 v2, v5, v2 -; GFX12-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX12-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX12-NEXT: v_mov_b32_e32 v4, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX12-NEXT: v_max_num_f32_e32 v5, v5, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 +; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v2, v5, v7, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v2, v4, v2, 0x7060302 +; GFX12-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 +; GFX12-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 +; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -5640,30 +5641,30 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ds_read_b32 v2, v0 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v4, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX940-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX940-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX940-NEXT: v_max_f32_e32 v2, v6, v2 -; GFX940-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_add3_u32 v5, v5, v3, s4 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v3 +; GFX940-NEXT: v_max_f32_e32 v5, v5, v1 +; GFX940-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v2, s4 +; GFX940-NEXT: v_add3_u32 v8, v8, v5, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[0:1] -; GFX940-NEXT: v_perm_b32 v2, v3, v2, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[0:1] +; GFX940-NEXT: v_perm_b32 v2, v5, v2, s5 ; GFX940-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -5679,38 +5680,39 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: ds_load_b32 v2, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_lshlrev_b32 v2, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: v_dual_max_f32 v4, v6, v4 :: v_dual_lshlrev_b32 v5, 16, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_max_f32_e32 v2, v5, v2 -; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX11-NEXT: v_mov_b32_e32 v4, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX11-NEXT: v_max_f32_e32 v5, v5, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 +; GFX11-NEXT: v_max_f32_e32 v2, v2, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v5, v7, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v4, v2, 0x7060302 +; GFX11-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 +; GFX11-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 +; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -5725,33 +5727,33 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v2, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v3, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_max_f32_e32 v2, v5, v2 -; GFX10-NEXT: v_max_f32_e32 v4, v6, v4 -; GFX10-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX10-NEXT: v_add3_u32 v5, v5, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX10-NEXT: v_mov_b32_e32 v4, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX10-NEXT: v_max_f32_e32 v2, v2, v3 +; GFX10-NEXT: v_max_f32_e32 v5, v5, v1 +; GFX10-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v7, s4 -; GFX10-NEXT: v_perm_b32 v2, v4, v2, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 +; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB24_1 @@ -5765,29 +5767,29 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ds_read_b32 v2, v0 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v6, v2 -; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s8 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; GFX90A-NEXT: v_perm_b32 v2, v3, v2, s9 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v3 +; GFX90A-NEXT: v_max_f32_e32 v5, v5, v1 +; GFX90A-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v2, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v2, v5, v2, s9 ; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -5804,29 +5806,29 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: ds_read_b32 v2, v0 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX908-NEXT: v_max_f32_e32 v2, v6, v2 -; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_add3_u32 v5, v5, v3, s8 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; GFX908-NEXT: v_perm_b32 v2, v3, v2, s9 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v3 +; GFX908-NEXT: v_max_f32_e32 v5, v5, v1 +; GFX908-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v2, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v2, v5, v2, s9 ; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -5844,30 +5846,30 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v2, v0 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX8-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_max_f32_e32 v2, v6, v2 -; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX8-NEXT: v_max_f32_e32 v2, v2, v3 +; GFX8-NEXT: v_max_f32_e32 v5, v5, v1 +; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v2, v5, v2, 16 ; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -5969,36 +5971,37 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ds_load_b32 v2, v0 offset:65532 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_lshlrev_b32 v2, 16, v1 -; GFX12-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-NEXT: v_dual_max_num_f32 v4, v6, v4 :: v_dual_lshlrev_b32 v5, 16, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_max_num_f32_e32 v2, v5, v2 -; GFX12-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX12-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX12-NEXT: v_mov_b32_e32 v4, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX12-NEXT: v_max_num_f32_e32 v5, v5, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 +; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v2, v5, v7, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v2, v4, v2, 0x7060302 +; GFX12-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 +; GFX12-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532 +; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -6015,30 +6018,30 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v4, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX940-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX940-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX940-NEXT: v_max_f32_e32 v2, v6, v2 -; GFX940-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_add3_u32 v5, v5, v3, s4 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v3 +; GFX940-NEXT: v_max_f32_e32 v5, v5, v1 +; GFX940-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v2, s4 +; GFX940-NEXT: v_add3_u32 v8, v8, v5, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[0:1] -; GFX940-NEXT: v_perm_b32 v2, v3, v2, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[0:1] +; GFX940-NEXT: v_perm_b32 v2, v5, v2, s5 ; GFX940-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -6054,38 +6057,39 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: ds_load_b32 v2, v0 offset:65532 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_lshlrev_b32 v2, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: v_dual_max_f32 v4, v6, v4 :: v_dual_lshlrev_b32 v5, 16, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_max_f32_e32 v2, v5, v2 -; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX11-NEXT: v_mov_b32_e32 v4, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX11-NEXT: v_max_f32_e32 v5, v5, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 +; GFX11-NEXT: v_max_f32_e32 v2, v2, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v5, v7, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v4, v2, 0x7060302 +; GFX11-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 +; GFX11-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532 +; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -6100,33 +6104,33 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v2, v0 offset:65532 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v3, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_max_f32_e32 v2, v5, v2 -; GFX10-NEXT: v_max_f32_e32 v4, v6, v4 -; GFX10-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX10-NEXT: v_add3_u32 v5, v5, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX10-NEXT: v_mov_b32_e32 v4, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX10-NEXT: v_max_f32_e32 v2, v2, v3 +; GFX10-NEXT: v_max_f32_e32 v5, v5, v1 +; GFX10-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v7, s4 -; GFX10-NEXT: v_perm_b32 v2, v4, v2, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 +; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB25_1 @@ -6140,29 +6144,29 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v6, v2 -; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s8 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; GFX90A-NEXT: v_perm_b32 v2, v3, v2, s9 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v3 +; GFX90A-NEXT: v_max_f32_e32 v5, v5, v1 +; GFX90A-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v2, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v2, v5, v2, s9 ; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -6179,29 +6183,29 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX908-NEXT: v_max_f32_e32 v2, v6, v2 -; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_add3_u32 v5, v5, v3, s8 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; GFX908-NEXT: v_perm_b32 v2, v3, v2, s9 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v3 +; GFX908-NEXT: v_max_f32_e32 v5, v5, v1 +; GFX908-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v2, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v2, v5, v2, s9 ; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -6219,30 +6223,30 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX8-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_max_f32_e32 v2, v6, v2 -; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX8-NEXT: v_max_f32_e32 v2, v2, v3 +; GFX8-NEXT: v_max_f32_e32 v5, v5, v1 +; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v2, v5, v2, 16 ; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -6344,37 +6348,37 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: ds_load_b32 v2, v0 +; GFX12-NEXT: ds_load_b32 v3, v0 +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_max_num_f32 v3, v4, v3 :: v_dual_and_b32 v6, 0xffff0000, v2 -; GFX12-NEXT: v_max_num_f32_e32 v4, v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_max_num_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 +; GFX12-NEXT: v_max_num_f32_e32 v4, v4, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v5, v7, s0 -; GFX12-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v4, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v4, v3, 0x7060302 +; GFX12-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 +; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX12-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -6388,36 +6392,36 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX940-LABEL: local_atomic_fmax_noret_v2bf16: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ds_read_b32 v2, v0 +; GFX940-NEXT: ds_read_b32 v3, v0 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX940-NEXT: v_max_f32_e32 v3, v4, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v6, v5 -; GFX940-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v7, v4, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX940-NEXT: v_add3_u32 v5, v5, v3, s4 -; GFX940-NEXT: v_add3_u32 v7, v7, v4, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX940-NEXT: v_max_f32_e32 v4, v4, v2 +; GFX940-NEXT: v_max_f32_e32 v5, v5, v1 +; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4 +; GFX940-NEXT: v_add3_u32 v8, v8, v5, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[0:1] -; GFX940-NEXT: v_perm_b32 v3, v4, v3, s5 -; GFX940-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[0:1] +; GFX940-NEXT: v_perm_b32 v4, v5, v4, s5 +; GFX940-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB26_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6427,39 +6431,39 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX11-LABEL: local_atomic_fmax_noret_v2bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: ds_load_b32 v2, v0 +; GFX11-NEXT: ds_load_b32 v3, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_max_f32 v3, v4, v3 :: v_dual_and_b32 v6, 0xffff0000, v2 -; GFX11-NEXT: v_max_f32_e32 v4, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_max_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 +; GFX11-NEXT: v_max_f32_e32 v4, v4, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v5, v7, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v4, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v4, v3, 0x7060302 +; GFX11-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 +; GFX11-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -6472,34 +6476,34 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX10-LABEL: local_atomic_fmax_noret_v2bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: ds_read_b32 v2, v0 +; GFX10-NEXT: ds_read_b32 v3, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_max_f32_e32 v3, v4, v3 -; GFX10-NEXT: v_max_f32_e32 v4, v6, v5 -; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX10-NEXT: v_max_f32_e32 v4, v4, v2 +; GFX10-NEXT: v_max_f32_e32 v5, v5, v1 ; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v3, v5, v7, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v4, v3, 0x7060302 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v4, v4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 +; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB26_1 @@ -6510,35 +6514,35 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX90A-LABEL: local_atomic_fmax_noret_v2bf16: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v2, v0 +; GFX90A-NEXT: ds_read_b32 v3, v0 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX90A-NEXT: v_max_f32_e32 v3, v4, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v6, v5 -; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s8 -; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc -; GFX90A-NEXT: v_perm_b32 v3, v4, v3, s9 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v4, v2 +; GFX90A-NEXT: v_max_f32_e32 v5, v5, v1 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v5, v4, s9 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB26_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6548,35 +6552,35 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX908-LABEL: local_atomic_fmax_noret_v2bf16: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: ds_read_b32 v2, v0 +; GFX908-NEXT: ds_read_b32 v3, v0 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX908-NEXT: v_max_f32_e32 v3, v4, v3 -; GFX908-NEXT: v_max_f32_e32 v4, v6, v5 -; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v7, v4, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX908-NEXT: v_add3_u32 v5, v5, v3, s8 -; GFX908-NEXT: v_add3_u32 v7, v7, v4, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc -; GFX908-NEXT: v_perm_b32 v3, v4, v3, s9 -; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX908-NEXT: v_max_f32_e32 v4, v4, v2 +; GFX908-NEXT: v_max_f32_e32 v5, v5, v1 +; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX908-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v4, v5, v4, s9 +; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v2, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB26_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6587,36 +6591,36 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_read_b32 v2, v0 +; GFX8-NEXT: ds_read_b32 v3, v0 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX8-NEXT: v_max_f32_e32 v3, v4, v3 -; GFX8-NEXT: v_max_f32_e32 v4, v6, v5 -; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v7, v4, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v4 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX8-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX8-NEXT: v_max_f32_e32 v4, v4, v2 +; GFX8-NEXT: v_max_f32_e32 v5, v5, v1 +; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v2, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB26_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6708,37 +6712,37 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: ds_load_b32 v2, v0 offset:65532 +; GFX12-NEXT: ds_load_b32 v3, v0 offset:65532 +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_max_num_f32 v3, v4, v3 :: v_dual_and_b32 v6, 0xffff0000, v2 -; GFX12-NEXT: v_max_num_f32_e32 v4, v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_max_num_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 +; GFX12-NEXT: v_max_num_f32_e32 v4, v4, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v5, v7, s0 -; GFX12-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v4, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v4, v3, 0x7060302 +; GFX12-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532 +; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX12-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -6752,36 +6756,36 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX940-LABEL: local_atomic_fmax_noret_v2bf16__ofset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ds_read_b32 v2, v0 offset:65532 +; GFX940-NEXT: ds_read_b32 v3, v0 offset:65532 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX940-NEXT: v_max_f32_e32 v3, v4, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v6, v5 -; GFX940-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v7, v4, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX940-NEXT: v_add3_u32 v5, v5, v3, s4 -; GFX940-NEXT: v_add3_u32 v7, v7, v4, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX940-NEXT: v_max_f32_e32 v4, v4, v2 +; GFX940-NEXT: v_max_f32_e32 v5, v5, v1 +; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4 +; GFX940-NEXT: v_add3_u32 v8, v8, v5, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[0:1] -; GFX940-NEXT: v_perm_b32 v3, v4, v3, s5 -; GFX940-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[0:1] +; GFX940-NEXT: v_perm_b32 v4, v5, v4, s5 +; GFX940-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB27_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6791,39 +6795,39 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX11-LABEL: local_atomic_fmax_noret_v2bf16__ofset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: ds_load_b32 v2, v0 offset:65532 +; GFX11-NEXT: ds_load_b32 v3, v0 offset:65532 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_max_f32 v3, v4, v3 :: v_dual_and_b32 v6, 0xffff0000, v2 -; GFX11-NEXT: v_max_f32_e32 v4, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_max_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 +; GFX11-NEXT: v_max_f32_e32 v4, v4, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v5, v7, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v4, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v4, v3, 0x7060302 +; GFX11-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532 +; GFX11-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -6836,34 +6840,34 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX10-LABEL: local_atomic_fmax_noret_v2bf16__ofset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: ds_read_b32 v2, v0 offset:65532 +; GFX10-NEXT: ds_read_b32 v3, v0 offset:65532 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_max_f32_e32 v3, v4, v3 -; GFX10-NEXT: v_max_f32_e32 v4, v6, v5 -; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX10-NEXT: v_max_f32_e32 v4, v4, v2 +; GFX10-NEXT: v_max_f32_e32 v5, v5, v1 ; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v3, v5, v7, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v4, v3, 0x7060302 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v4, v4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 +; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB27_1 @@ -6874,35 +6878,35 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX90A-LABEL: local_atomic_fmax_noret_v2bf16__ofset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532 +; GFX90A-NEXT: ds_read_b32 v3, v0 offset:65532 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX90A-NEXT: v_max_f32_e32 v3, v4, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v6, v5 -; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s8 -; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc -; GFX90A-NEXT: v_perm_b32 v3, v4, v3, s9 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v4, v2 +; GFX90A-NEXT: v_max_f32_e32 v5, v5, v1 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v5, v4, s9 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB27_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6912,35 +6916,35 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX908-LABEL: local_atomic_fmax_noret_v2bf16__ofset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: ds_read_b32 v2, v0 offset:65532 +; GFX908-NEXT: ds_read_b32 v3, v0 offset:65532 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX908-NEXT: v_max_f32_e32 v3, v4, v3 -; GFX908-NEXT: v_max_f32_e32 v4, v6, v5 -; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v7, v4, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX908-NEXT: v_add3_u32 v5, v5, v3, s8 -; GFX908-NEXT: v_add3_u32 v7, v7, v4, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc -; GFX908-NEXT: v_perm_b32 v3, v4, v3, s9 -; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX908-NEXT: v_max_f32_e32 v4, v4, v2 +; GFX908-NEXT: v_max_f32_e32 v5, v5, v1 +; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX908-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v4, v5, v4, s9 +; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v2, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB27_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6951,36 +6955,36 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_read_b32 v2, v0 offset:65532 +; GFX8-NEXT: ds_read_b32 v3, v0 offset:65532 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX8-NEXT: v_max_f32_e32 v3, v4, v3 -; GFX8-NEXT: v_max_f32_e32 v4, v6, v5 -; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v7, v4, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v4 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX8-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX8-NEXT: v_max_f32_e32 v4, v4, v2 +; GFX8-NEXT: v_max_f32_e32 v5, v5, v1 +; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v2, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB27_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll index 1c9cff7326d65..282947afa409a 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll @@ -4520,15 +4520,15 @@ define <2 x half> @local_atomic_fmin_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ds_load_b32 v2, v0 +; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: v_pk_max_num_f16 v2, v1, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v4, v3, v3 -; GFX12-NEXT: v_pk_min_num_f16 v2, v4, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 +; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -4550,17 +4550,17 @@ define <2 x half> @local_atomic_fmin_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ds_read_b32 v2, v0 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX940-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v4, v2 -; GFX940-NEXT: v_pk_max_f16 v3, v1, v1 -; GFX940-NEXT: v_pk_max_f16 v2, v4, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_min_f16 v2, v2, v3 -; GFX940-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 +; GFX940-NEXT: v_pk_min_f16 v2, v2, v1 +; GFX940-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB20_1 @@ -4573,15 +4573,15 @@ define <2 x half> @local_atomic_fmin_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: ds_load_b32 v2, v0 +; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: v_pk_max_f16 v2, v1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v4, v3, v3 -; GFX11-NEXT: v_pk_min_f16 v2, v4, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX11-NEXT: v_pk_min_f16 v2, v2, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -4600,14 +4600,14 @@ define <2 x half> @local_atomic_fmin_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v2, v0 +; GFX10-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v2 -; GFX10-NEXT: v_pk_max_f16 v2, v1, v1 -; GFX10-NEXT: v_pk_max_f16 v4, v3, v3 -; GFX10-NEXT: v_pk_min_f16 v2, v4, v2 +; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX10-NEXT: v_pk_min_f16 v2, v2, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -4626,16 +4626,16 @@ define <2 x half> @local_atomic_fmin_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ds_read_b32 v2, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX90A-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_pk_max_f16 v3, v1, v1 -; GFX90A-NEXT: v_pk_max_f16 v2, v4, v4 -; GFX90A-NEXT: v_pk_min_f16 v2, v2, v3 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX90A-NEXT: v_pk_min_f16 v2, v2, v1 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB20_1 @@ -4649,16 +4649,16 @@ define <2 x half> @local_atomic_fmin_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: ds_read_b32 v2, v0 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_pk_max_f16 v3, v1, v1 -; GFX908-NEXT: v_pk_max_f16 v2, v4, v4 -; GFX908-NEXT: v_pk_min_f16 v2, v2, v3 -; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX908-NEXT: v_pk_min_f16 v2, v2, v1 +; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB20_1 @@ -4673,17 +4673,17 @@ define <2 x half> @local_atomic_fmin_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v2, v0 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_sdwa v3, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 ; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_max_f16_sdwa v3, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v2, v1, v1 -; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v4, v4 -; GFX8-NEXT: v_min_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v2, v6, v2 -; GFX8-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX8-NEXT: v_max_f16_sdwa v2, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v4, v4 +; GFX8-NEXT: v_min_f16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v5, v5, v1 +; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 ; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -4792,15 +4792,15 @@ define <2 x half> @local_atomic_fmin_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ds_load_b32 v2, v0 offset:65532 +; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: v_pk_max_num_f16 v2, v1, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v4, v3, v3 -; GFX12-NEXT: v_pk_min_num_f16 v2, v4, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 +; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -4822,17 +4822,17 @@ define <2 x half> @local_atomic_fmin_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX940-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v4, v2 -; GFX940-NEXT: v_pk_max_f16 v3, v1, v1 -; GFX940-NEXT: v_pk_max_f16 v2, v4, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_min_f16 v2, v2, v3 -; GFX940-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 +; GFX940-NEXT: v_pk_min_f16 v2, v2, v1 +; GFX940-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB21_1 @@ -4845,15 +4845,15 @@ define <2 x half> @local_atomic_fmin_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: ds_load_b32 v2, v0 offset:65532 +; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: v_pk_max_f16 v2, v1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v4, v3, v3 -; GFX11-NEXT: v_pk_min_f16 v2, v4, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX11-NEXT: v_pk_min_f16 v2, v2, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -4872,14 +4872,14 @@ define <2 x half> @local_atomic_fmin_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v2, v0 offset:65532 +; GFX10-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v2 -; GFX10-NEXT: v_pk_max_f16 v2, v1, v1 -; GFX10-NEXT: v_pk_max_f16 v4, v3, v3 -; GFX10-NEXT: v_pk_min_f16 v2, v4, v2 +; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX10-NEXT: v_pk_min_f16 v2, v2, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -4898,16 +4898,16 @@ define <2 x half> @local_atomic_fmin_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX90A-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_pk_max_f16 v3, v1, v1 -; GFX90A-NEXT: v_pk_max_f16 v2, v4, v4 -; GFX90A-NEXT: v_pk_min_f16 v2, v2, v3 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX90A-NEXT: v_pk_min_f16 v2, v2, v1 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB21_1 @@ -4921,16 +4921,16 @@ define <2 x half> @local_atomic_fmin_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX908-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_pk_max_f16 v3, v1, v1 -; GFX908-NEXT: v_pk_max_f16 v2, v4, v4 -; GFX908-NEXT: v_pk_min_f16 v2, v2, v3 -; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX908-NEXT: v_pk_min_f16 v2, v2, v1 +; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB21_1 @@ -4945,17 +4945,17 @@ define <2 x half> @local_atomic_fmin_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_sdwa v3, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 ; GFX8-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_max_f16_sdwa v3, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v2, v1, v1 -; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v4, v4 -; GFX8-NEXT: v_min_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v2, v6, v2 -; GFX8-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX8-NEXT: v_max_f16_sdwa v2, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v4, v4 +; GFX8-NEXT: v_min_f16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v5, v5, v1 +; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 ; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -5064,14 +5064,14 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ds_load_b32 v2, v0 +; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_pk_max_num_f16 v3, v1, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 +; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_min_num_f16 v3, v4, v3 +; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -5093,13 +5093,13 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ds_read_b32 v2, v0 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX940-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_pk_max_f16 v3, v1, v1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX940-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_min_f16 v3, v4, v3 +; GFX940-NEXT: v_pk_min_f16 v3, v3, v1 ; GFX940-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 @@ -5115,14 +5115,14 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: ds_load_b32 v2, v0 +; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_pk_max_f16 v3, v1, v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_min_f16 v3, v4, v3 +; GFX11-NEXT: v_pk_min_f16 v3, v3, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -5141,13 +5141,13 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v2, v0 +; GFX10-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_pk_max_f16 v3, v1, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX10-NEXT: v_pk_min_f16 v3, v4, v3 +; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX10-NEXT: v_pk_min_f16 v3, v3, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -5166,12 +5166,12 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ds_read_b32 v2, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_pk_max_f16 v3, v1, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX90A-NEXT: v_pk_min_f16 v3, v4, v3 +; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX90A-NEXT: v_pk_min_f16 v3, v3, v1 ; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 @@ -5188,12 +5188,12 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: ds_read_b32 v2, v0 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_pk_max_f16 v3, v1, v1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX908-NEXT: v_pk_min_f16 v3, v4, v3 +; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX908-NEXT: v_pk_min_f16 v3, v3, v1 ; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 @@ -5209,23 +5209,23 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_read_b32 v2, v0 +; GFX8-NEXT: ds_read_b32 v3, v0 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_sdwa v2, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 ; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_max_f16_sdwa v3, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v1, v1 -; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX8-NEXT: v_min_f16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v4, v6, v5 -; GFX8-NEXT: v_or_b32_e32 v3, v4, v3 -; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 +; GFX8-NEXT: v_max_f16_sdwa v4, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v3, v3 +; GFX8-NEXT: v_min_f16_sdwa v4, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v5, v5, v1 +; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v2, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB22_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5326,14 +5326,14 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ds_load_b32 v2, v0 offset:65532 +; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_pk_max_num_f16 v3, v1, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 +; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_min_num_f16 v3, v4, v3 +; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -5355,13 +5355,13 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX940-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_pk_max_f16 v3, v1, v1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX940-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_min_f16 v3, v4, v3 +; GFX940-NEXT: v_pk_min_f16 v3, v3, v1 ; GFX940-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 @@ -5377,14 +5377,14 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: ds_load_b32 v2, v0 offset:65532 +; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_pk_max_f16 v3, v1, v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_min_f16 v3, v4, v3 +; GFX11-NEXT: v_pk_min_f16 v3, v3, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -5403,13 +5403,13 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v2, v0 offset:65532 +; GFX10-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_pk_max_f16 v3, v1, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX10-NEXT: v_pk_min_f16 v3, v4, v3 +; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX10-NEXT: v_pk_min_f16 v3, v3, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -5428,12 +5428,12 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX90A-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_pk_max_f16 v3, v1, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX90A-NEXT: v_pk_min_f16 v3, v4, v3 +; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX90A-NEXT: v_pk_min_f16 v3, v3, v1 ; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 @@ -5450,12 +5450,12 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_pk_max_f16 v3, v1, v1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX908-NEXT: v_pk_min_f16 v3, v4, v3 +; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX908-NEXT: v_pk_min_f16 v3, v3, v1 ; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 @@ -5471,23 +5471,23 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_read_b32 v2, v0 offset:65532 +; GFX8-NEXT: ds_read_b32 v3, v0 offset:65532 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_max_f16_sdwa v2, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 ; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_max_f16_sdwa v3, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v1, v1 -; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX8-NEXT: v_min_f16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v4, v6, v5 -; GFX8-NEXT: v_or_b32_e32 v3, v4, v3 -; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 +; GFX8-NEXT: v_max_f16_sdwa v4, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v3, v3 +; GFX8-NEXT: v_min_f16_sdwa v4, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v5, v5, v1 +; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v2, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB23_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5594,36 +5594,37 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ds_load_b32 v2, v0 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_lshlrev_b32 v2, 16, v1 -; GFX12-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-NEXT: v_dual_min_num_f32 v4, v6, v4 :: v_dual_lshlrev_b32 v5, 16, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_min_num_f32_e32 v2, v5, v2 -; GFX12-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX12-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX12-NEXT: v_mov_b32_e32 v4, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX12-NEXT: v_min_num_f32_e32 v5, v5, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 +; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v2, v5, v7, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v2, v4, v2, 0x7060302 +; GFX12-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 +; GFX12-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 +; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -5640,30 +5641,30 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ds_read_b32 v2, v0 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v4, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX940-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX940-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX940-NEXT: v_min_f32_e32 v2, v6, v2 -; GFX940-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_add3_u32 v5, v5, v3, s4 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX940-NEXT: v_min_f32_e32 v2, v2, v3 +; GFX940-NEXT: v_min_f32_e32 v5, v5, v1 +; GFX940-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v2, s4 +; GFX940-NEXT: v_add3_u32 v8, v8, v5, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[0:1] -; GFX940-NEXT: v_perm_b32 v2, v3, v2, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[0:1] +; GFX940-NEXT: v_perm_b32 v2, v5, v2, s5 ; GFX940-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -5679,38 +5680,39 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: ds_load_b32 v2, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_lshlrev_b32 v2, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: v_dual_min_f32 v4, v6, v4 :: v_dual_lshlrev_b32 v5, 16, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_min_f32_e32 v2, v5, v2 -; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX11-NEXT: v_mov_b32_e32 v4, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX11-NEXT: v_min_f32_e32 v5, v5, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 +; GFX11-NEXT: v_min_f32_e32 v2, v2, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v5, v7, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v4, v2, 0x7060302 +; GFX11-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 +; GFX11-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 +; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -5725,33 +5727,33 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v2, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v3, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_min_f32_e32 v2, v5, v2 -; GFX10-NEXT: v_min_f32_e32 v4, v6, v4 -; GFX10-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX10-NEXT: v_add3_u32 v5, v5, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX10-NEXT: v_mov_b32_e32 v4, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX10-NEXT: v_min_f32_e32 v2, v2, v3 +; GFX10-NEXT: v_min_f32_e32 v5, v5, v1 +; GFX10-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v7, s4 -; GFX10-NEXT: v_perm_b32 v2, v4, v2, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 +; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB24_1 @@ -5765,29 +5767,29 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ds_read_b32 v2, v0 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX90A-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v6, v2 -; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s8 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; GFX90A-NEXT: v_perm_b32 v2, v3, v2, s9 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v3 +; GFX90A-NEXT: v_min_f32_e32 v5, v5, v1 +; GFX90A-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v2, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v2, v5, v2, s9 ; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -5804,29 +5806,29 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: ds_read_b32 v2, v0 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX908-NEXT: v_min_f32_e32 v2, v6, v2 -; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_add3_u32 v5, v5, v3, s8 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; GFX908-NEXT: v_perm_b32 v2, v3, v2, s9 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX908-NEXT: v_min_f32_e32 v2, v2, v3 +; GFX908-NEXT: v_min_f32_e32 v5, v5, v1 +; GFX908-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v2, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v2, v5, v2, s9 ; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -5844,30 +5846,30 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v2, v0 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX8-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_min_f32_e32 v2, v6, v2 -; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX8-NEXT: v_min_f32_e32 v2, v2, v3 +; GFX8-NEXT: v_min_f32_e32 v5, v5, v1 +; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v2, v5, v2, 16 ; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -5969,36 +5971,37 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ds_load_b32 v2, v0 offset:65532 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_lshlrev_b32 v2, 16, v1 -; GFX12-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-NEXT: v_dual_min_num_f32 v4, v6, v4 :: v_dual_lshlrev_b32 v5, 16, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_min_num_f32_e32 v2, v5, v2 -; GFX12-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX12-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX12-NEXT: v_mov_b32_e32 v4, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX12-NEXT: v_min_num_f32_e32 v5, v5, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 +; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v2, v5, v7, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v2, v4, v2, 0x7060302 +; GFX12-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 +; GFX12-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532 +; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -6015,30 +6018,30 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v4, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX940-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX940-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX940-NEXT: v_min_f32_e32 v2, v6, v2 -; GFX940-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_add3_u32 v5, v5, v3, s4 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX940-NEXT: v_min_f32_e32 v2, v2, v3 +; GFX940-NEXT: v_min_f32_e32 v5, v5, v1 +; GFX940-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v2, s4 +; GFX940-NEXT: v_add3_u32 v8, v8, v5, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[0:1] -; GFX940-NEXT: v_perm_b32 v2, v3, v2, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[0:1] +; GFX940-NEXT: v_perm_b32 v2, v5, v2, s5 ; GFX940-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -6054,38 +6057,39 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: ds_load_b32 v2, v0 offset:65532 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_lshlrev_b32 v2, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: v_dual_min_f32 v4, v6, v4 :: v_dual_lshlrev_b32 v5, 16, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_min_f32_e32 v2, v5, v2 -; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX11-NEXT: v_mov_b32_e32 v4, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX11-NEXT: v_min_f32_e32 v5, v5, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 +; GFX11-NEXT: v_min_f32_e32 v2, v2, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v5, v7, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v4, v2, 0x7060302 +; GFX11-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 +; GFX11-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532 +; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -6100,33 +6104,33 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v2, v0 offset:65532 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v3, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_min_f32_e32 v2, v5, v2 -; GFX10-NEXT: v_min_f32_e32 v4, v6, v4 -; GFX10-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX10-NEXT: v_add3_u32 v5, v5, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX10-NEXT: v_mov_b32_e32 v4, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX10-NEXT: v_min_f32_e32 v2, v2, v3 +; GFX10-NEXT: v_min_f32_e32 v5, v5, v1 +; GFX10-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v7, s4 -; GFX10-NEXT: v_perm_b32 v2, v4, v2, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 +; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB25_1 @@ -6140,29 +6144,29 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX90A-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v6, v2 -; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s8 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; GFX90A-NEXT: v_perm_b32 v2, v3, v2, s9 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v3 +; GFX90A-NEXT: v_min_f32_e32 v5, v5, v1 +; GFX90A-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v2, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v2, v5, v2, s9 ; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -6179,29 +6183,29 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX908-NEXT: v_min_f32_e32 v2, v6, v2 -; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_add3_u32 v5, v5, v3, s8 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; GFX908-NEXT: v_perm_b32 v2, v3, v2, s9 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX908-NEXT: v_min_f32_e32 v2, v2, v3 +; GFX908-NEXT: v_min_f32_e32 v5, v5, v1 +; GFX908-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v2, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v2, v5, v2, s9 ; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -6219,30 +6223,30 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX8-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_min_f32_e32 v2, v6, v2 -; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX8-NEXT: v_min_f32_e32 v2, v2, v3 +; GFX8-NEXT: v_min_f32_e32 v5, v5, v1 +; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v2, v5, v2, 16 ; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -6344,37 +6348,37 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: ds_load_b32 v2, v0 +; GFX12-NEXT: ds_load_b32 v3, v0 +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_min_num_f32 v3, v4, v3 :: v_dual_and_b32 v6, 0xffff0000, v2 -; GFX12-NEXT: v_min_num_f32_e32 v4, v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_min_num_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 +; GFX12-NEXT: v_min_num_f32_e32 v4, v4, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v5, v7, s0 -; GFX12-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v4, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v4, v3, 0x7060302 +; GFX12-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 +; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX12-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -6388,36 +6392,36 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX940-LABEL: local_atomic_fmin_noret_v2bf16: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ds_read_b32 v2, v0 +; GFX940-NEXT: ds_read_b32 v3, v0 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX940-NEXT: v_min_f32_e32 v3, v4, v3 -; GFX940-NEXT: v_min_f32_e32 v4, v6, v5 -; GFX940-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v7, v4, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX940-NEXT: v_add3_u32 v5, v5, v3, s4 -; GFX940-NEXT: v_add3_u32 v7, v7, v4, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX940-NEXT: v_min_f32_e32 v4, v4, v2 +; GFX940-NEXT: v_min_f32_e32 v5, v5, v1 +; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4 +; GFX940-NEXT: v_add3_u32 v8, v8, v5, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[0:1] -; GFX940-NEXT: v_perm_b32 v3, v4, v3, s5 -; GFX940-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[0:1] +; GFX940-NEXT: v_perm_b32 v4, v5, v4, s5 +; GFX940-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB26_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6427,39 +6431,39 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX11-LABEL: local_atomic_fmin_noret_v2bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: ds_load_b32 v2, v0 +; GFX11-NEXT: ds_load_b32 v3, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_min_f32 v3, v4, v3 :: v_dual_and_b32 v6, 0xffff0000, v2 -; GFX11-NEXT: v_min_f32_e32 v4, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_min_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 +; GFX11-NEXT: v_min_f32_e32 v4, v4, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v5, v7, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v4, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v4, v3, 0x7060302 +; GFX11-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 +; GFX11-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -6472,34 +6476,34 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX10-LABEL: local_atomic_fmin_noret_v2bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: ds_read_b32 v2, v0 +; GFX10-NEXT: ds_read_b32 v3, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_min_f32_e32 v3, v4, v3 -; GFX10-NEXT: v_min_f32_e32 v4, v6, v5 -; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX10-NEXT: v_min_f32_e32 v4, v4, v2 +; GFX10-NEXT: v_min_f32_e32 v5, v5, v1 ; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v3, v5, v7, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v4, v3, 0x7060302 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v4, v4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 +; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB26_1 @@ -6510,35 +6514,35 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX90A-LABEL: local_atomic_fmin_noret_v2bf16: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v2, v0 +; GFX90A-NEXT: ds_read_b32 v3, v0 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX90A-NEXT: v_min_f32_e32 v3, v4, v3 -; GFX90A-NEXT: v_min_f32_e32 v4, v6, v5 -; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s8 -; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc -; GFX90A-NEXT: v_perm_b32 v3, v4, v3, s9 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX90A-NEXT: v_min_f32_e32 v4, v4, v2 +; GFX90A-NEXT: v_min_f32_e32 v5, v5, v1 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v5, v4, s9 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB26_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6548,35 +6552,35 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX908-LABEL: local_atomic_fmin_noret_v2bf16: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: ds_read_b32 v2, v0 +; GFX908-NEXT: ds_read_b32 v3, v0 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX908-NEXT: v_min_f32_e32 v3, v4, v3 -; GFX908-NEXT: v_min_f32_e32 v4, v6, v5 -; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v7, v4, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX908-NEXT: v_add3_u32 v5, v5, v3, s8 -; GFX908-NEXT: v_add3_u32 v7, v7, v4, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc -; GFX908-NEXT: v_perm_b32 v3, v4, v3, s9 -; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX908-NEXT: v_min_f32_e32 v4, v4, v2 +; GFX908-NEXT: v_min_f32_e32 v5, v5, v1 +; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX908-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v4, v5, v4, s9 +; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v2, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB26_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6587,36 +6591,36 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_read_b32 v2, v0 +; GFX8-NEXT: ds_read_b32 v3, v0 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX8-NEXT: v_min_f32_e32 v3, v4, v3 -; GFX8-NEXT: v_min_f32_e32 v4, v6, v5 -; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v7, v4, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v4 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX8-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX8-NEXT: v_min_f32_e32 v4, v4, v2 +; GFX8-NEXT: v_min_f32_e32 v5, v5, v1 +; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v2, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB26_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6708,37 +6712,37 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: ds_load_b32 v2, v0 offset:65532 +; GFX12-NEXT: ds_load_b32 v3, v0 offset:65532 +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_min_num_f32 v3, v4, v3 :: v_dual_and_b32 v6, 0xffff0000, v2 -; GFX12-NEXT: v_min_num_f32_e32 v4, v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_min_num_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 +; GFX12-NEXT: v_min_num_f32_e32 v4, v4, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v5, v7, s0 -; GFX12-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v4, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v4, v3, 0x7060302 +; GFX12-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532 +; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX12-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -6752,36 +6756,36 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX940-LABEL: local_atomic_fmin_noret_v2bf16__ofset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ds_read_b32 v2, v0 offset:65532 +; GFX940-NEXT: ds_read_b32 v3, v0 offset:65532 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX940-NEXT: v_min_f32_e32 v3, v4, v3 -; GFX940-NEXT: v_min_f32_e32 v4, v6, v5 -; GFX940-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v7, v4, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX940-NEXT: v_add3_u32 v5, v5, v3, s4 -; GFX940-NEXT: v_add3_u32 v7, v7, v4, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX940-NEXT: v_min_f32_e32 v4, v4, v2 +; GFX940-NEXT: v_min_f32_e32 v5, v5, v1 +; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4 +; GFX940-NEXT: v_add3_u32 v8, v8, v5, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[0:1] -; GFX940-NEXT: v_perm_b32 v3, v4, v3, s5 -; GFX940-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[0:1] +; GFX940-NEXT: v_perm_b32 v4, v5, v4, s5 +; GFX940-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB27_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6791,39 +6795,39 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX11-LABEL: local_atomic_fmin_noret_v2bf16__ofset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: ds_load_b32 v2, v0 offset:65532 +; GFX11-NEXT: ds_load_b32 v3, v0 offset:65532 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_min_f32 v3, v4, v3 :: v_dual_and_b32 v6, 0xffff0000, v2 -; GFX11-NEXT: v_min_f32_e32 v4, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_min_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 +; GFX11-NEXT: v_min_f32_e32 v4, v4, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v5, v7, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v4, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v4, v3, 0x7060302 +; GFX11-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532 +; GFX11-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -6836,34 +6840,34 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX10-LABEL: local_atomic_fmin_noret_v2bf16__ofset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: ds_read_b32 v2, v0 offset:65532 +; GFX10-NEXT: ds_read_b32 v3, v0 offset:65532 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_min_f32_e32 v3, v4, v3 -; GFX10-NEXT: v_min_f32_e32 v4, v6, v5 -; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX10-NEXT: v_min_f32_e32 v4, v4, v2 +; GFX10-NEXT: v_min_f32_e32 v5, v5, v1 ; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v3, v5, v7, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v4, v3, 0x7060302 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v4, v4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 +; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB27_1 @@ -6874,35 +6878,35 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX90A-LABEL: local_atomic_fmin_noret_v2bf16__ofset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532 +; GFX90A-NEXT: ds_read_b32 v3, v0 offset:65532 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX90A-NEXT: v_min_f32_e32 v3, v4, v3 -; GFX90A-NEXT: v_min_f32_e32 v4, v6, v5 -; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s8 -; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc -; GFX90A-NEXT: v_perm_b32 v3, v4, v3, s9 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX90A-NEXT: v_min_f32_e32 v4, v4, v2 +; GFX90A-NEXT: v_min_f32_e32 v5, v5, v1 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v5, v4, s9 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB27_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6912,35 +6916,35 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX908-LABEL: local_atomic_fmin_noret_v2bf16__ofset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: ds_read_b32 v2, v0 offset:65532 +; GFX908-NEXT: ds_read_b32 v3, v0 offset:65532 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX908-NEXT: v_min_f32_e32 v3, v4, v3 -; GFX908-NEXT: v_min_f32_e32 v4, v6, v5 -; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v7, v4, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX908-NEXT: v_add3_u32 v5, v5, v3, s8 -; GFX908-NEXT: v_add3_u32 v7, v7, v4, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc -; GFX908-NEXT: v_perm_b32 v3, v4, v3, s9 -; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX908-NEXT: v_min_f32_e32 v4, v4, v2 +; GFX908-NEXT: v_min_f32_e32 v5, v5, v1 +; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX908-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v4, v5, v4, s9 +; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v2, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB27_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6951,36 +6955,36 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_read_b32 v2, v0 offset:65532 +; GFX8-NEXT: ds_read_b32 v3, v0 offset:65532 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX8-NEXT: v_min_f32_e32 v3, v4, v3 -; GFX8-NEXT: v_min_f32_e32 v4, v6, v5 -; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v7, v4, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v4 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX8-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX8-NEXT: v_min_f32_e32 v4, v4, v2 +; GFX8-NEXT: v_min_f32_e32 v5, v5, v1 +; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v2, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB27_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll index 2433eca80b23c..1b08b64b046b4 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll @@ -6390,36 +6390,37 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ds_load_b32 v2, v0 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_lshlrev_b32 v2, 16, v1 -; GFX12-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-NEXT: v_dual_sub_f32 v4, v6, v4 :: v_dual_lshlrev_b32 v5, 16, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_sub_f32_e32 v2, v5, v2 -; GFX12-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX12-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX12-NEXT: v_mov_b32_e32 v4, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX12-NEXT: v_sub_f32_e32 v5, v5, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 +; GFX12-NEXT: v_sub_f32_e32 v2, v2, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v2, v5, v7, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v2, v4, v2, 0x7060302 +; GFX12-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 +; GFX12-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 +; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -6436,30 +6437,30 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ds_read_b32 v2, v0 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v4, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX940-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX940-NEXT: v_sub_f32_e32 v3, v5, v3 -; GFX940-NEXT: v_sub_f32_e32 v2, v6, v2 -; GFX940-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_add3_u32 v5, v5, v3, s4 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX940-NEXT: v_sub_f32_e32 v2, v2, v3 +; GFX940-NEXT: v_sub_f32_e32 v5, v5, v1 +; GFX940-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v2, s4 +; GFX940-NEXT: v_add3_u32 v8, v8, v5, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[0:1] -; GFX940-NEXT: v_perm_b32 v2, v3, v2, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[0:1] +; GFX940-NEXT: v_perm_b32 v2, v5, v2, s5 ; GFX940-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -6475,38 +6476,39 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: ds_load_b32 v2, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_lshlrev_b32 v2, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: v_dual_sub_f32 v4, v6, v4 :: v_dual_lshlrev_b32 v5, 16, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_sub_f32_e32 v2, v5, v2 -; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX11-NEXT: v_mov_b32_e32 v4, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX11-NEXT: v_sub_f32_e32 v5, v5, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 +; GFX11-NEXT: v_sub_f32_e32 v2, v2, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v5, v7, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v4, v2, 0x7060302 +; GFX11-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 +; GFX11-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 +; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -6521,33 +6523,33 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v2, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v3, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_sub_f32_e32 v2, v5, v2 -; GFX10-NEXT: v_sub_f32_e32 v4, v6, v4 -; GFX10-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX10-NEXT: v_add3_u32 v5, v5, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX10-NEXT: v_mov_b32_e32 v4, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX10-NEXT: v_sub_f32_e32 v2, v2, v3 +; GFX10-NEXT: v_sub_f32_e32 v5, v5, v1 +; GFX10-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v7, s4 -; GFX10-NEXT: v_perm_b32 v2, v4, v2, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 +; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB24_1 @@ -6561,29 +6563,29 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ds_read_b32 v2, v0 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX90A-NEXT: v_sub_f32_e32 v3, v5, v3 -; GFX90A-NEXT: v_sub_f32_e32 v2, v6, v2 -; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s8 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; GFX90A-NEXT: v_perm_b32 v2, v3, v2, s9 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v3 +; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v1 +; GFX90A-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v2, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v2, v5, v2, s9 ; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -6600,29 +6602,29 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: ds_read_b32 v2, v0 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX908-NEXT: v_sub_f32_e32 v3, v5, v3 -; GFX908-NEXT: v_sub_f32_e32 v2, v6, v2 -; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_add3_u32 v5, v5, v3, s8 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; GFX908-NEXT: v_perm_b32 v2, v3, v2, s9 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX908-NEXT: v_sub_f32_e32 v2, v2, v3 +; GFX908-NEXT: v_sub_f32_e32 v5, v5, v1 +; GFX908-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v2, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v2, v5, v2, s9 ; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -6640,30 +6642,30 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v2, v0 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX8-NEXT: v_sub_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_sub_f32_e32 v2, v6, v2 -; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX8-NEXT: v_sub_f32_e32 v2, v2, v3 +; GFX8-NEXT: v_sub_f32_e32 v5, v5, v1 +; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v2, v5, v2, 16 ; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -6765,36 +6767,37 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ds_load_b32 v2, v0 offset:65532 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_lshlrev_b32 v2, 16, v1 -; GFX12-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-NEXT: v_dual_sub_f32 v4, v6, v4 :: v_dual_lshlrev_b32 v5, 16, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_sub_f32_e32 v2, v5, v2 -; GFX12-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX12-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX12-NEXT: v_mov_b32_e32 v4, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX12-NEXT: v_sub_f32_e32 v5, v5, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 +; GFX12-NEXT: v_sub_f32_e32 v2, v2, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v2, v5, v7, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v2, v4, v2, 0x7060302 +; GFX12-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 +; GFX12-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532 +; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -6811,30 +6814,30 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v4, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX940-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX940-NEXT: v_sub_f32_e32 v3, v5, v3 -; GFX940-NEXT: v_sub_f32_e32 v2, v6, v2 -; GFX940-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_add3_u32 v5, v5, v3, s4 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX940-NEXT: v_sub_f32_e32 v2, v2, v3 +; GFX940-NEXT: v_sub_f32_e32 v5, v5, v1 +; GFX940-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v2, s4 +; GFX940-NEXT: v_add3_u32 v8, v8, v5, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[0:1] -; GFX940-NEXT: v_perm_b32 v2, v3, v2, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[0:1] +; GFX940-NEXT: v_perm_b32 v2, v5, v2, s5 ; GFX940-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -6850,38 +6853,39 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: ds_load_b32 v2, v0 offset:65532 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_lshlrev_b32 v2, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: v_dual_sub_f32 v4, v6, v4 :: v_dual_lshlrev_b32 v5, 16, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_sub_f32_e32 v2, v5, v2 -; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX11-NEXT: v_mov_b32_e32 v4, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX11-NEXT: v_sub_f32_e32 v5, v5, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 +; GFX11-NEXT: v_sub_f32_e32 v2, v2, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v5, v7, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v4, v2, 0x7060302 +; GFX11-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 +; GFX11-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532 +; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -6896,33 +6900,33 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v2, v0 offset:65532 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v3, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_sub_f32_e32 v2, v5, v2 -; GFX10-NEXT: v_sub_f32_e32 v4, v6, v4 -; GFX10-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX10-NEXT: v_add3_u32 v5, v5, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX10-NEXT: v_mov_b32_e32 v4, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX10-NEXT: v_sub_f32_e32 v2, v2, v3 +; GFX10-NEXT: v_sub_f32_e32 v5, v5, v1 +; GFX10-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v7, s4 -; GFX10-NEXT: v_perm_b32 v2, v4, v2, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 +; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB25_1 @@ -6936,29 +6940,29 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX90A-NEXT: v_sub_f32_e32 v3, v5, v3 -; GFX90A-NEXT: v_sub_f32_e32 v2, v6, v2 -; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s8 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; GFX90A-NEXT: v_perm_b32 v2, v3, v2, s9 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v3 +; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v1 +; GFX90A-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v2, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v2, v5, v2, s9 ; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -6975,29 +6979,29 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX908-NEXT: v_sub_f32_e32 v3, v5, v3 -; GFX908-NEXT: v_sub_f32_e32 v2, v6, v2 -; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_add3_u32 v5, v5, v3, s8 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; GFX908-NEXT: v_perm_b32 v2, v3, v2, s9 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX908-NEXT: v_sub_f32_e32 v2, v2, v3 +; GFX908-NEXT: v_sub_f32_e32 v5, v5, v1 +; GFX908-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v2, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v2, v5, v2, s9 ; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -7015,30 +7019,30 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX8-NEXT: v_sub_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_sub_f32_e32 v2, v6, v2 -; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX8-NEXT: v_sub_f32_e32 v2, v2, v3 +; GFX8-NEXT: v_sub_f32_e32 v5, v5, v1 +; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v2, v5, v2, 16 ; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -7140,37 +7144,37 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: ds_load_b32 v2, v0 +; GFX12-NEXT: ds_load_b32 v3, v0 +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_sub_f32 v3, v4, v3 :: v_dual_and_b32 v6, 0xffff0000, v2 -; GFX12-NEXT: v_sub_f32_e32 v4, v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_sub_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 +; GFX12-NEXT: v_sub_f32_e32 v4, v4, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v5, v7, s0 -; GFX12-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v4, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v4, v3, 0x7060302 +; GFX12-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 +; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX12-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -7184,36 +7188,36 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX940-LABEL: local_atomic_fsub_noret_v2bf16: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ds_read_b32 v2, v0 +; GFX940-NEXT: ds_read_b32 v3, v0 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX940-NEXT: v_sub_f32_e32 v3, v4, v3 -; GFX940-NEXT: v_sub_f32_e32 v4, v6, v5 -; GFX940-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v7, v4, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX940-NEXT: v_add3_u32 v5, v5, v3, s4 -; GFX940-NEXT: v_add3_u32 v7, v7, v4, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX940-NEXT: v_sub_f32_e32 v4, v4, v2 +; GFX940-NEXT: v_sub_f32_e32 v5, v5, v1 +; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4 +; GFX940-NEXT: v_add3_u32 v8, v8, v5, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[0:1] -; GFX940-NEXT: v_perm_b32 v3, v4, v3, s5 -; GFX940-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[0:1] +; GFX940-NEXT: v_perm_b32 v4, v5, v4, s5 +; GFX940-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB26_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7223,39 +7227,39 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX11-LABEL: local_atomic_fsub_noret_v2bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: ds_load_b32 v2, v0 +; GFX11-NEXT: ds_load_b32 v3, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_sub_f32 v3, v4, v3 :: v_dual_and_b32 v6, 0xffff0000, v2 -; GFX11-NEXT: v_sub_f32_e32 v4, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_sub_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 +; GFX11-NEXT: v_sub_f32_e32 v4, v4, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v5, v7, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v4, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v4, v3, 0x7060302 +; GFX11-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 +; GFX11-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -7268,34 +7272,34 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX10-LABEL: local_atomic_fsub_noret_v2bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: ds_read_b32 v2, v0 +; GFX10-NEXT: ds_read_b32 v3, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_sub_f32_e32 v3, v4, v3 -; GFX10-NEXT: v_sub_f32_e32 v4, v6, v5 -; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX10-NEXT: v_sub_f32_e32 v4, v4, v2 +; GFX10-NEXT: v_sub_f32_e32 v5, v5, v1 ; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v3, v5, v7, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v4, v3, 0x7060302 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v4, v4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 +; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB26_1 @@ -7306,35 +7310,35 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX90A-LABEL: local_atomic_fsub_noret_v2bf16: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v2, v0 +; GFX90A-NEXT: ds_read_b32 v3, v0 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX90A-NEXT: v_sub_f32_e32 v3, v4, v3 -; GFX90A-NEXT: v_sub_f32_e32 v4, v6, v5 -; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s8 -; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc -; GFX90A-NEXT: v_perm_b32 v3, v4, v3, s9 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX90A-NEXT: v_sub_f32_e32 v4, v4, v2 +; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v1 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v5, v4, s9 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB26_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7344,35 +7348,35 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX908-LABEL: local_atomic_fsub_noret_v2bf16: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: ds_read_b32 v2, v0 +; GFX908-NEXT: ds_read_b32 v3, v0 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX908-NEXT: v_sub_f32_e32 v3, v4, v3 -; GFX908-NEXT: v_sub_f32_e32 v4, v6, v5 -; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v7, v4, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX908-NEXT: v_add3_u32 v5, v5, v3, s8 -; GFX908-NEXT: v_add3_u32 v7, v7, v4, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc -; GFX908-NEXT: v_perm_b32 v3, v4, v3, s9 -; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX908-NEXT: v_sub_f32_e32 v4, v4, v2 +; GFX908-NEXT: v_sub_f32_e32 v5, v5, v1 +; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX908-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v4, v5, v4, s9 +; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v2, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB26_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7383,36 +7387,36 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_read_b32 v2, v0 +; GFX8-NEXT: ds_read_b32 v3, v0 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX8-NEXT: v_sub_f32_e32 v3, v4, v3 -; GFX8-NEXT: v_sub_f32_e32 v4, v6, v5 -; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v7, v4, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v4 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX8-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX8-NEXT: v_sub_f32_e32 v4, v4, v2 +; GFX8-NEXT: v_sub_f32_e32 v5, v5, v1 +; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v2, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB26_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7504,37 +7508,37 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: ds_load_b32 v2, v0 offset:65532 +; GFX12-NEXT: ds_load_b32 v3, v0 offset:65532 +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_sub_f32 v3, v4, v3 :: v_dual_and_b32 v6, 0xffff0000, v2 -; GFX12-NEXT: v_sub_f32_e32 v4, v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_sub_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 +; GFX12-NEXT: v_sub_f32_e32 v4, v4, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v5, v7, s0 -; GFX12-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v4, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v3, v4, v3, 0x7060302 +; GFX12-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532 +; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX12-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -7548,36 +7552,36 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX940-LABEL: local_atomic_fsub_noret_v2bf16__ofset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ds_read_b32 v2, v0 offset:65532 +; GFX940-NEXT: ds_read_b32 v3, v0 offset:65532 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX940-NEXT: v_sub_f32_e32 v3, v4, v3 -; GFX940-NEXT: v_sub_f32_e32 v4, v6, v5 -; GFX940-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v7, v4, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX940-NEXT: v_add3_u32 v5, v5, v3, s4 -; GFX940-NEXT: v_add3_u32 v7, v7, v4, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX940-NEXT: v_sub_f32_e32 v4, v4, v2 +; GFX940-NEXT: v_sub_f32_e32 v5, v5, v1 +; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4 +; GFX940-NEXT: v_add3_u32 v8, v8, v5, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[0:1] -; GFX940-NEXT: v_perm_b32 v3, v4, v3, s5 -; GFX940-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[0:1] +; GFX940-NEXT: v_perm_b32 v4, v5, v4, s5 +; GFX940-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB27_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7587,39 +7591,39 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX11-LABEL: local_atomic_fsub_noret_v2bf16__ofset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: ds_load_b32 v2, v0 offset:65532 +; GFX11-NEXT: ds_load_b32 v3, v0 offset:65532 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_sub_f32 v3, v4, v3 :: v_dual_and_b32 v6, 0xffff0000, v2 -; GFX11-NEXT: v_sub_f32_e32 v4, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_sub_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 +; GFX11-NEXT: v_sub_f32_e32 v4, v4, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v5, v7, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v4, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v4, v3, 0x7060302 +; GFX11-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532 +; GFX11-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -7632,34 +7636,34 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX10-LABEL: local_atomic_fsub_noret_v2bf16__ofset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: ds_read_b32 v2, v0 offset:65532 +; GFX10-NEXT: ds_read_b32 v3, v0 offset:65532 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_sub_f32_e32 v3, v4, v3 -; GFX10-NEXT: v_sub_f32_e32 v4, v6, v5 -; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX10-NEXT: v_sub_f32_e32 v4, v4, v2 +; GFX10-NEXT: v_sub_f32_e32 v5, v5, v1 ; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v3, v5, v7, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v4, v3, 0x7060302 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v4, v4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 +; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB27_1 @@ -7670,35 +7674,35 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX90A-LABEL: local_atomic_fsub_noret_v2bf16__ofset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532 +; GFX90A-NEXT: ds_read_b32 v3, v0 offset:65532 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX90A-NEXT: v_sub_f32_e32 v3, v4, v3 -; GFX90A-NEXT: v_sub_f32_e32 v4, v6, v5 -; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s8 -; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc -; GFX90A-NEXT: v_perm_b32 v3, v4, v3, s9 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX90A-NEXT: v_sub_f32_e32 v4, v4, v2 +; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v1 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v5, v4, s9 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB27_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7708,35 +7712,35 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX908-LABEL: local_atomic_fsub_noret_v2bf16__ofset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: ds_read_b32 v2, v0 offset:65532 +; GFX908-NEXT: ds_read_b32 v3, v0 offset:65532 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX908-NEXT: v_sub_f32_e32 v3, v4, v3 -; GFX908-NEXT: v_sub_f32_e32 v4, v6, v5 -; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v7, v4, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX908-NEXT: v_add3_u32 v5, v5, v3, s8 -; GFX908-NEXT: v_add3_u32 v7, v7, v4, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc -; GFX908-NEXT: v_perm_b32 v3, v4, v3, s9 -; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX908-NEXT: v_sub_f32_e32 v4, v4, v2 +; GFX908-NEXT: v_sub_f32_e32 v5, v5, v1 +; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX908-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v4, v5, v4, s9 +; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v2, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB27_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7747,36 +7751,36 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_read_b32 v2, v0 offset:65532 +; GFX8-NEXT: ds_read_b32 v3, v0 offset:65532 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX8-NEXT: v_sub_f32_e32 v3, v4, v3 -; GFX8-NEXT: v_sub_f32_e32 v4, v6, v5 -; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v7, v4, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v4 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX8-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX8-NEXT: v_sub_f32_e32 v4, v4, v2 +; GFX8-NEXT: v_sub_f32_e32 v5, v5, v1 +; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v2, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB27_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll b/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll index be8a6295c8a71..8157b1a7f7c80 100644 --- a/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll +++ b/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll @@ -161,13 +161,13 @@ define void @issue63986_reduced_expanded(i64 %idxprom) { ; CHECK-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] ; CHECK-NEXT: s_cbranch_execnz .LBB1_8 ; CHECK-NEXT: .LBB1_5: ; %loop-memcpy-residual.preheader -; CHECK-NEXT: s_mov_b64 s[6:7], 0 -; CHECK-NEXT: .LBB1_6: ; %loop-memcpy-residual -; CHECK-NEXT: s_add_u32 s6, s6, 1 ; CHECK-NEXT: v_mov_b32_e32 v0, s4 +; CHECK-NEXT: s_mov_b64 s[6:7], 0 ; CHECK-NEXT: v_mov_b32_e32 v1, s5 -; CHECK-NEXT: s_addc_u32 s7, s7, 0 -; CHECK-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] +; CHECK-NEXT: .LBB1_6: ; %loop-memcpy-residual +; CHECK-NEXT: s_add_u32 s4, s6, 1 +; CHECK-NEXT: s_addc_u32 s5, s7, 0 +; CHECK-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1] ; CHECK-NEXT: s_mov_b64 s[6:7], 1 ; CHECK-NEXT: s_cbranch_vccnz .LBB1_6 ; CHECK-NEXT: ; %bb.7: ; %Flow diff --git a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll index 86a6ad46e8768..a9b8663a48dea 100644 --- a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll +++ b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll @@ -5,21 +5,23 @@ define amdgpu_kernel void @matmul_kernel(i32 %a0, i32 %a1) { ; GFX942-LABEL: matmul_kernel: ; GFX942: ; %bb.0: ; %entry -; GFX942-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NEXT: v_mov_b32_e32 v1, 0 -; GFX942-NEXT: s_mov_b32 s0, 0 +; GFX942-NEXT: s_mov_b32 s2, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a0, v1 -; GFX942-NEXT: s_mov_b32 s1, 0 +; GFX942-NEXT: s_mov_b32 s3, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: s_cmp_lg_u32 s2, 0 -; GFX942-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX942-NEXT: s_cmp_lg_u32 s0, 0 +; GFX942-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX942-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX942-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 ; GFX942-NEXT: s_branch .LBB0_2 ; GFX942-NEXT: .LBB0_1: ; %bb2 ; GFX942-NEXT: ; in Loop: Header=BB0_2 Depth=1 -; GFX942-NEXT: s_or_b32 s4, s1, 1 -; GFX942-NEXT: s_ashr_i32 s5, s1, 31 -; GFX942-NEXT: s_mov_b32 s1, s0 -; GFX942-NEXT: v_mov_b64_e32 v[4:5], s[0:1] +; GFX942-NEXT: s_or_b32 s4, s3, 1 +; GFX942-NEXT: s_ashr_i32 s5, s3, 31 +; GFX942-NEXT: s_mov_b32 s3, s2 +; GFX942-NEXT: v_mov_b64_e32 v[4:5], s[2:3] ; GFX942-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX942-NEXT: v_mov_b32_e32 v2, v1 ; GFX942-NEXT: v_mov_b32_e32 v3, v1 @@ -27,54 +29,56 @@ define amdgpu_kernel void @matmul_kernel(i32 %a0, i32 %a1) { ; GFX942-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX942-NEXT: v_accvgpr_write_b32 a2, v2 ; GFX942-NEXT: v_accvgpr_write_b32 a3, v3 -; GFX942-NEXT: s_and_b32 s1, s5, s4 +; GFX942-NEXT: s_and_b32 s3, s5, s4 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_mfma_f32_16x16x16_f16 a[0:3], v[4:5], v[4:5], a[0:3] ; GFX942-NEXT: s_cbranch_execz .LBB0_4 ; GFX942-NEXT: .LBB0_2: ; %bb ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX942-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX942-NEXT: s_and_b64 vcc, exec, s[0:1] ; GFX942-NEXT: s_cbranch_vccz .LBB0_1 ; GFX942-NEXT: ; %bb.3: -; GFX942-NEXT: ; implicit-def: $sgpr1 +; GFX942-NEXT: ; implicit-def: $sgpr3 ; GFX942-NEXT: .LBB0_4: ; %common.ret ; GFX942-NEXT: s_endpgm ; ; GFX908-LABEL: matmul_kernel: ; GFX908: ; %bb.0: ; %entry -; GFX908-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 +; GFX908-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX908-NEXT: v_mov_b32_e32 v1, 0 -; GFX908-NEXT: s_mov_b32 s0, 0 -; GFX908-NEXT: s_mov_b32 s1, 0 +; GFX908-NEXT: s_mov_b32 s2, 0 +; GFX908-NEXT: s_mov_b32 s3, 0 ; GFX908-NEXT: v_accvgpr_write_b32 a0, v1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: s_cmp_lg_u32 s2, 0 -; GFX908-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX908-NEXT: s_cmp_lg_u32 s0, 0 +; GFX908-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX908-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX908-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 ; GFX908-NEXT: s_branch .LBB0_2 ; GFX908-NEXT: .LBB0_1: ; %bb2 ; GFX908-NEXT: ; in Loop: Header=BB0_2 Depth=1 -; GFX908-NEXT: s_or_b32 s4, s1, 1 -; GFX908-NEXT: s_ashr_i32 s5, s1, 31 -; GFX908-NEXT: s_mov_b32 s1, s0 +; GFX908-NEXT: s_or_b32 s4, s3, 1 +; GFX908-NEXT: s_ashr_i32 s5, s3, 31 +; GFX908-NEXT: s_mov_b32 s3, s2 ; GFX908-NEXT: s_nop 3 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX908-NEXT: v_mov_b32_e32 v5, s1 -; GFX908-NEXT: v_mov_b32_e32 v4, s0 +; GFX908-NEXT: v_mov_b32_e32 v5, s3 +; GFX908-NEXT: v_mov_b32_e32 v4, s2 ; GFX908-NEXT: v_mov_b32_e32 v2, v1 ; GFX908-NEXT: v_mov_b32_e32 v3, v1 ; GFX908-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX908-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX908-NEXT: v_accvgpr_write_b32 a2, v2 ; GFX908-NEXT: v_accvgpr_write_b32 a3, v3 -; GFX908-NEXT: s_and_b32 s1, s5, s4 +; GFX908-NEXT: s_and_b32 s3, s5, s4 ; GFX908-NEXT: v_mfma_f32_16x16x16f16 a[0:3], v[4:5], v[4:5], a[0:3] ; GFX908-NEXT: s_cbranch_execz .LBB0_4 ; GFX908-NEXT: .LBB0_2: ; %bb ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX908-NEXT: s_and_b64 vcc, exec, s[0:1] ; GFX908-NEXT: s_cbranch_vccz .LBB0_1 ; GFX908-NEXT: ; %bb.3: -; GFX908-NEXT: ; implicit-def: $sgpr1 +; GFX908-NEXT: ; implicit-def: $sgpr3 ; GFX908-NEXT: .LBB0_4: ; %common.ret ; GFX908-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll b/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll index c80254210109a..3e45a2d0df43d 100644 --- a/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll +++ b/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll @@ -23,8 +23,10 @@ define amdgpu_kernel void @negated_cond(ptr addrspace(1) %arg1) { ; GCN-NEXT: ; Child Loop BB0_4 Depth 2 ; GCN-NEXT: buffer_load_dword v1, off, s[8:11], 0 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 ; GCN-NEXT: v_cmp_ne_u32_e64 s[2:3], 0, v1 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v1 ; GCN-NEXT: s_mov_b32 s12, s6 ; GCN-NEXT: s_branch .LBB0_4 ; GCN-NEXT: .LBB0_3: ; %Flow1 @@ -34,7 +36,7 @@ define amdgpu_kernel void @negated_cond(ptr addrspace(1) %arg1) { ; GCN-NEXT: .LBB0_4: ; %bb2 ; GCN-NEXT: ; Parent Loop BB0_2 Depth=1 ; GCN-NEXT: ; => This Inner Loop Header: Depth=2 -; GCN-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN-NEXT: s_and_b64 vcc, exec, s[0:1] ; GCN-NEXT: s_lshl_b32 s12, s12, 5 ; GCN-NEXT: s_cbranch_vccz .LBB0_6 ; GCN-NEXT: ; %bb.5: ; in Loop: Header=BB0_4 Depth=2 diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll index cb9f6c1f38eea..96dd6276f7e38 100644 --- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll @@ -1744,8 +1744,8 @@ define i64 @v_test_sdiv_pow2_k_den_i64(i64 %x) { ; GCN-IR-NEXT: v_addc_u32_e64 v5, s[4:5], 0, -1, vcc ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 -; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 ; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff +; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 ; GCN-IR-NEXT: .LBB13_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll index c1cc61bbacd0f..23364e860d154 100644 --- a/llvm/test/CodeGen/AMDGPU/srem64.ll +++ b/llvm/test/CodeGen/AMDGPU/srem64.ll @@ -1865,8 +1865,8 @@ define i64 @v_test_srem_pow2_k_den_i64(i64 %x) { ; GCN-IR-NEXT: v_addc_u32_e64 v7, s[4:5], 0, -1, vcc ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 -; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 ; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff +; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 ; GCN-IR-NEXT: .LBB13_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 diff --git a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll index 15dc545175055..12eec4fa3bd59 100644 --- a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll +++ b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll @@ -32,51 +32,71 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-LABEL: kernel: ; GLOBALNESS1: ; %bb.0: ; %bb ; GLOBALNESS1-NEXT: s_mov_b64 s[36:37], s[6:7] -; GLOBALNESS1-NEXT: s_load_dwordx4 s[64:67], s[8:9], 0x0 +; GLOBALNESS1-NEXT: s_load_dwordx4 s[76:79], s[8:9], 0x0 ; GLOBALNESS1-NEXT: s_load_dword s6, s[8:9], 0x14 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v41, v0 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v42, 0 ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[0:1], 0, 0 ; GLOBALNESS1-NEXT: global_store_dword v[0:1], v42, off ; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0) -; GLOBALNESS1-NEXT: global_load_dword v2, v42, s[64:65] -; GLOBALNESS1-NEXT: s_add_u32 flat_scratch_lo, s12, s17 -; GLOBALNESS1-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GLOBALNESS1-NEXT: s_add_u32 s0, s0, s17 -; GLOBALNESS1-NEXT: s_addc_u32 s1, s1, 0 +; GLOBALNESS1-NEXT: global_load_dword v2, v42, s[76:77] ; GLOBALNESS1-NEXT: s_mov_b64 s[40:41], s[4:5] ; GLOBALNESS1-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x18 ; GLOBALNESS1-NEXT: s_load_dword s7, s[8:9], 0x20 -; GLOBALNESS1-NEXT: s_bitcmp1_b32 s66, 0 -; GLOBALNESS1-NEXT: s_cselect_b64 s[68:69], -1, 0 +; GLOBALNESS1-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GLOBALNESS1-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GLOBALNESS1-NEXT: s_add_u32 s0, s0, s17 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v0, 0 -; GLOBALNESS1-NEXT: s_xor_b64 s[70:71], s[68:69], -1 +; GLOBALNESS1-NEXT: s_addc_u32 s1, s1, 0 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v1, 0x40994400 -; GLOBALNESS1-NEXT: s_bitcmp1_b32 s6, 0 +; GLOBALNESS1-NEXT: s_bitcmp1_b32 s78, 0 ; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0) -; GLOBALNESS1-NEXT: v_cmp_ngt_f64_e64 s[42:43], s[4:5], v[0:1] -; GLOBALNESS1-NEXT: v_cmp_ngt_f64_e64 s[64:65], s[4:5], 0 +; GLOBALNESS1-NEXT: v_cmp_ngt_f64_e32 vcc, s[4:5], v[0:1] +; GLOBALNESS1-NEXT: v_cmp_ngt_f64_e64 s[4:5], s[4:5], 0 +; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] +; GLOBALNESS1-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5] +; GLOBALNESS1-NEXT: s_xor_b64 s[4:5], s[4:5], -1 +; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GLOBALNESS1-NEXT: s_bitcmp1_b32 s6, 0 +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[42:43], 1, v0 +; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GLOBALNESS1-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GLOBALNESS1-NEXT: s_xor_b64 s[72:73], s[4:5], -1 +; GLOBALNESS1-NEXT: s_xor_b64 s[4:5], s[4:5], -1 ; GLOBALNESS1-NEXT: s_bitcmp1_b32 s7, 0 +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[48:49], 1, v0 +; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GLOBALNESS1-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GLOBALNESS1-NEXT: s_mov_b32 s60, s16 +; GLOBALNESS1-NEXT: s_xor_b64 s[4:5], s[4:5], -1 +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[50:51], 1, v0 +; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[52:53], 1, v0 +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[44:45], 1, v1 +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[46:47], 1, v3 +; GLOBALNESS1-NEXT: s_mov_b32 s70, s16 ; GLOBALNESS1-NEXT: s_mov_b64 s[38:39], s[8:9] -; GLOBALNESS1-NEXT: s_mov_b32 s61, s15 -; GLOBALNESS1-NEXT: s_mov_b32 s62, s14 +; GLOBALNESS1-NEXT: s_mov_b32 s71, s15 +; GLOBALNESS1-NEXT: s_mov_b32 s72, s14 ; GLOBALNESS1-NEXT: s_mov_b64 s[34:35], s[10:11] -; GLOBALNESS1-NEXT: s_xor_b64 s[74:75], s[4:5], -1 ; GLOBALNESS1-NEXT: s_mov_b32 s32, 0 ; GLOBALNESS1-NEXT: ; implicit-def: $vgpr44_vgpr45 ; GLOBALNESS1-NEXT: s_waitcnt vmcnt(0) -; GLOBALNESS1-NEXT: v_cmp_gt_i32_e64 s[44:45], 0, v2 -; GLOBALNESS1-NEXT: v_cmp_gt_i32_e64 s[46:47], 1, v2 -; GLOBALNESS1-NEXT: v_cmp_eq_u32_e64 s[48:49], 1, v2 -; GLOBALNESS1-NEXT: v_cmp_eq_u32_e64 s[50:51], 0, v2 +; GLOBALNESS1-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2 +; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GLOBALNESS1-NEXT: v_cmp_gt_i32_e32 vcc, 1, v2 +; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GLOBALNESS1-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GLOBALNESS1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[54:55], 1, v0 +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[56:57], 1, v1 +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[58:59], 1, v3 +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[60:61], 1, v2 ; GLOBALNESS1-NEXT: s_branch .LBB1_4 ; GLOBALNESS1-NEXT: .LBB1_1: ; %bb70.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[50:51] +; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[60:61] ; GLOBALNESS1-NEXT: s_cbranch_vccz .LBB1_29 ; GLOBALNESS1-NEXT: .LBB1_2: ; %Flow15 ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 @@ -105,23 +125,23 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[40:41] ; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[36:37] ; GLOBALNESS1-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS1-NEXT: s_mov_b32 s12, s62 -; GLOBALNESS1-NEXT: s_mov_b32 s13, s61 -; GLOBALNESS1-NEXT: s_mov_b32 s14, s60 +; GLOBALNESS1-NEXT: s_mov_b32 s12, s72 +; GLOBALNESS1-NEXT: s_mov_b32 s13, s71 +; GLOBALNESS1-NEXT: s_mov_b32 s14, s70 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0) ; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[68:69] +; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[46:47] ; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], -1 ; GLOBALNESS1-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_9 ; GLOBALNESS1-NEXT: ; %bb.5: ; %NodeBlock ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: s_cmp_lt_i32 s67, 1 +; GLOBALNESS1-NEXT: s_cmp_lt_i32 s79, 1 ; GLOBALNESS1-NEXT: s_cbranch_scc1 .LBB1_7 ; GLOBALNESS1-NEXT: ; %bb.6: ; %LeafBlock12 ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: s_cmp_lg_u32 s67, 1 +; GLOBALNESS1-NEXT: s_cmp_lg_u32 s79, 1 ; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], -1 ; GLOBALNESS1-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_8 @@ -131,7 +151,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GLOBALNESS1-NEXT: .LBB1_8: ; %LeafBlock ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: s_cmp_lg_u32 s67, 0 +; GLOBALNESS1-NEXT: s_cmp_lg_u32 s79, 0 ; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], 0 ; GLOBALNESS1-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GLOBALNESS1-NEXT: .LBB1_9: ; %Flow25 @@ -143,15 +163,15 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[2:3], 0, 0 ; GLOBALNESS1-NEXT: flat_load_dword v0, v[2:3] ; GLOBALNESS1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GLOBALNESS1-NEXT: v_cmp_gt_i32_e64 s[52:53], 0, v0 +; GLOBALNESS1-NEXT: v_cmp_gt_i32_e64 s[62:63], 0, v0 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v0, 0 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v1, 0x3ff00000 -; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[76:77], s[52:53] +; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[74:75], s[62:63] ; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_26 ; GLOBALNESS1-NEXT: ; %bb.11: ; %bb33.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: global_load_dwordx2 v[0:1], v[2:3], off -; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[44:45] +; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[54:55] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_13 ; GLOBALNESS1-NEXT: ; %bb.12: ; %bb39.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 @@ -163,70 +183,72 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: v_cmp_ne_u32_e32 vcc, 0, v46 ; GLOBALNESS1-NEXT: v_cndmask_b32_e32 v2, 0, v40, vcc ; GLOBALNESS1-NEXT: s_waitcnt vmcnt(0) -; GLOBALNESS1-NEXT: v_cmp_nlt_f64_e64 s[54:55], 0, v[0:1] -; GLOBALNESS1-NEXT: v_cmp_eq_u32_e64 s[56:57], 0, v2 +; GLOBALNESS1-NEXT: v_cmp_nlt_f64_e32 vcc, 0, v[0:1] +; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GLOBALNESS1-NEXT: v_cmp_eq_u32_e64 s[64:65], 0, v2 +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[66:67], 1, v0 ; GLOBALNESS1-NEXT: s_branch .LBB1_16 ; GLOBALNESS1-NEXT: .LBB1_14: ; %Flow16 ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 ; GLOBALNESS1-NEXT: s_or_b64 exec, exec, s[4:5] ; GLOBALNESS1-NEXT: .LBB1_15: ; %bb63.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[74:75] +; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[52:53] ; GLOBALNESS1-NEXT: s_cbranch_vccz .LBB1_25 ; GLOBALNESS1-NEXT: .LBB1_16: ; %bb44.i ; GLOBALNESS1-NEXT: ; Parent Loop BB1_4 Depth=1 ; GLOBALNESS1-NEXT: ; => This Inner Loop Header: Depth=2 -; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[70:71] +; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[48:49] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_15 ; GLOBALNESS1-NEXT: ; %bb.17: ; %bb46.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[72:73] +; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[50:51] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_15 ; GLOBALNESS1-NEXT: ; %bb.18: ; %bb50.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[42:43] +; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[42:43] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_21 ; GLOBALNESS1-NEXT: ; %bb.19: ; %bb3.i.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[64:65] +; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[44:45] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_21 ; GLOBALNESS1-NEXT: ; %bb.20: ; %bb6.i.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[54:55] +; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[66:67] ; GLOBALNESS1-NEXT: .LBB1_21: ; %spam.exit.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[46:47] +; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[56:57] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_15 ; GLOBALNESS1-NEXT: ; %bb.22: ; %bb55.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS1-NEXT: s_add_u32 s58, s38, 40 -; GLOBALNESS1-NEXT: s_addc_u32 s59, s39, 0 +; GLOBALNESS1-NEXT: s_add_u32 s68, s38, 40 +; GLOBALNESS1-NEXT: s_addc_u32 s69, s39, 0 ; GLOBALNESS1-NEXT: s_getpc_b64 s[4:5] ; GLOBALNESS1-NEXT: s_add_u32 s4, s4, wobble@gotpcrel32@lo+4 ; GLOBALNESS1-NEXT: s_addc_u32 s5, s5, wobble@gotpcrel32@hi+12 -; GLOBALNESS1-NEXT: s_load_dwordx2 s[78:79], s[4:5], 0x0 +; GLOBALNESS1-NEXT: s_load_dwordx2 s[76:77], s[4:5], 0x0 ; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[40:41] ; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[36:37] -; GLOBALNESS1-NEXT: s_mov_b64 s[8:9], s[58:59] +; GLOBALNESS1-NEXT: s_mov_b64 s[8:9], s[68:69] ; GLOBALNESS1-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS1-NEXT: s_mov_b32 s12, s62 -; GLOBALNESS1-NEXT: s_mov_b32 s13, s61 -; GLOBALNESS1-NEXT: s_mov_b32 s14, s60 +; GLOBALNESS1-NEXT: s_mov_b32 s12, s72 +; GLOBALNESS1-NEXT: s_mov_b32 s13, s71 +; GLOBALNESS1-NEXT: s_mov_b32 s14, s70 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0) -; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[78:79] +; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[76:77] ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[46:47], 0, 0 ; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[40:41] ; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[36:37] -; GLOBALNESS1-NEXT: s_mov_b64 s[8:9], s[58:59] +; GLOBALNESS1-NEXT: s_mov_b64 s[8:9], s[68:69] ; GLOBALNESS1-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS1-NEXT: s_mov_b32 s12, s62 -; GLOBALNESS1-NEXT: s_mov_b32 s13, s61 -; GLOBALNESS1-NEXT: s_mov_b32 s14, s60 +; GLOBALNESS1-NEXT: s_mov_b32 s12, s72 +; GLOBALNESS1-NEXT: s_mov_b32 s13, s71 +; GLOBALNESS1-NEXT: s_mov_b32 s14, s70 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS1-NEXT: global_store_dwordx2 v[46:47], v[44:45], off -; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[78:79] -; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[4:5], s[56:57] +; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[76:77] +; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[4:5], s[64:65] ; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_14 ; GLOBALNESS1-NEXT: ; %bb.23: ; %bb62.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 @@ -242,12 +264,12 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[0:1], 0, 0 ; GLOBALNESS1-NEXT: .LBB1_26: ; %Flow24 ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: s_or_b64 exec, exec, s[76:77] -; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[4:5], s[52:53] +; GLOBALNESS1-NEXT: s_or_b64 exec, exec, s[74:75] +; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[4:5], s[62:63] ; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_2 ; GLOBALNESS1-NEXT: ; %bb.27: ; %bb67.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[48:49] +; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[58:59] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_1 ; GLOBALNESS1-NEXT: ; %bb.28: ; %bb69.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 @@ -271,9 +293,9 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[40:41] ; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[36:37] ; GLOBALNESS1-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS1-NEXT: s_mov_b32 s12, s62 -; GLOBALNESS1-NEXT: s_mov_b32 s13, s61 -; GLOBALNESS1-NEXT: s_mov_b32 s14, s60 +; GLOBALNESS1-NEXT: s_mov_b32 s12, s72 +; GLOBALNESS1-NEXT: s_mov_b32 s13, s71 +; GLOBALNESS1-NEXT: s_mov_b32 s14, s70 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS1-NEXT: s_getpc_b64 s[16:17] ; GLOBALNESS1-NEXT: s_add_u32 s16, s16, widget@rel32@lo+4 @@ -289,9 +311,9 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[40:41] ; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[36:37] ; GLOBALNESS1-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS1-NEXT: s_mov_b32 s12, s62 -; GLOBALNESS1-NEXT: s_mov_b32 s13, s61 -; GLOBALNESS1-NEXT: s_mov_b32 s14, s60 +; GLOBALNESS1-NEXT: s_mov_b32 s12, s72 +; GLOBALNESS1-NEXT: s_mov_b32 s13, s71 +; GLOBALNESS1-NEXT: s_mov_b32 s14, s70 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS1-NEXT: s_getpc_b64 s[16:17] ; GLOBALNESS1-NEXT: s_add_u32 s16, s16, widget@rel32@lo+4 @@ -302,51 +324,71 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-LABEL: kernel: ; GLOBALNESS0: ; %bb.0: ; %bb ; GLOBALNESS0-NEXT: s_mov_b64 s[36:37], s[6:7] -; GLOBALNESS0-NEXT: s_load_dwordx4 s[64:67], s[8:9], 0x0 +; GLOBALNESS0-NEXT: s_load_dwordx4 s[72:75], s[8:9], 0x0 ; GLOBALNESS0-NEXT: s_load_dword s6, s[8:9], 0x14 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v41, v0 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v42, 0 ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[0:1], 0, 0 ; GLOBALNESS0-NEXT: global_store_dword v[0:1], v42, off ; GLOBALNESS0-NEXT: s_waitcnt lgkmcnt(0) -; GLOBALNESS0-NEXT: global_load_dword v2, v42, s[64:65] -; GLOBALNESS0-NEXT: s_add_u32 flat_scratch_lo, s12, s17 -; GLOBALNESS0-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GLOBALNESS0-NEXT: s_add_u32 s0, s0, s17 -; GLOBALNESS0-NEXT: s_addc_u32 s1, s1, 0 +; GLOBALNESS0-NEXT: global_load_dword v2, v42, s[72:73] ; GLOBALNESS0-NEXT: s_mov_b64 s[40:41], s[4:5] ; GLOBALNESS0-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x18 ; GLOBALNESS0-NEXT: s_load_dword s7, s[8:9], 0x20 -; GLOBALNESS0-NEXT: s_bitcmp1_b32 s66, 0 -; GLOBALNESS0-NEXT: s_cselect_b64 s[68:69], -1, 0 +; GLOBALNESS0-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GLOBALNESS0-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GLOBALNESS0-NEXT: s_add_u32 s0, s0, s17 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v0, 0 -; GLOBALNESS0-NEXT: s_xor_b64 s[70:71], s[68:69], -1 +; GLOBALNESS0-NEXT: s_addc_u32 s1, s1, 0 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v1, 0x40994400 -; GLOBALNESS0-NEXT: s_bitcmp1_b32 s6, 0 +; GLOBALNESS0-NEXT: s_bitcmp1_b32 s74, 0 ; GLOBALNESS0-NEXT: s_waitcnt lgkmcnt(0) -; GLOBALNESS0-NEXT: v_cmp_ngt_f64_e64 s[42:43], s[4:5], v[0:1] -; GLOBALNESS0-NEXT: v_cmp_ngt_f64_e64 s[64:65], s[4:5], 0 +; GLOBALNESS0-NEXT: v_cmp_ngt_f64_e32 vcc, s[4:5], v[0:1] +; GLOBALNESS0-NEXT: v_cmp_ngt_f64_e64 s[4:5], s[4:5], 0 +; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] +; GLOBALNESS0-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5] +; GLOBALNESS0-NEXT: s_xor_b64 s[4:5], s[4:5], -1 +; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GLOBALNESS0-NEXT: s_bitcmp1_b32 s6, 0 +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[42:43], 1, v0 +; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GLOBALNESS0-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GLOBALNESS0-NEXT: s_xor_b64 s[72:73], s[4:5], -1 +; GLOBALNESS0-NEXT: s_xor_b64 s[4:5], s[4:5], -1 ; GLOBALNESS0-NEXT: s_bitcmp1_b32 s7, 0 +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[48:49], 1, v0 +; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GLOBALNESS0-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GLOBALNESS0-NEXT: s_mov_b32 s58, s16 +; GLOBALNESS0-NEXT: s_xor_b64 s[4:5], s[4:5], -1 +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[50:51], 1, v0 +; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[52:53], 1, v0 +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[44:45], 1, v1 +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[46:47], 1, v3 +; GLOBALNESS0-NEXT: s_mov_b32 s68, s16 ; GLOBALNESS0-NEXT: s_mov_b64 s[38:39], s[8:9] -; GLOBALNESS0-NEXT: s_mov_b32 s59, s15 -; GLOBALNESS0-NEXT: s_mov_b32 s60, s14 +; GLOBALNESS0-NEXT: s_mov_b32 s69, s15 +; GLOBALNESS0-NEXT: s_mov_b32 s70, s14 ; GLOBALNESS0-NEXT: s_mov_b64 s[34:35], s[10:11] -; GLOBALNESS0-NEXT: s_xor_b64 s[74:75], s[4:5], -1 ; GLOBALNESS0-NEXT: s_mov_b32 s32, 0 ; GLOBALNESS0-NEXT: ; implicit-def: $vgpr44_vgpr45 ; GLOBALNESS0-NEXT: s_waitcnt vmcnt(0) -; GLOBALNESS0-NEXT: v_cmp_gt_i32_e64 s[44:45], 0, v2 -; GLOBALNESS0-NEXT: v_cmp_gt_i32_e64 s[46:47], 1, v2 -; GLOBALNESS0-NEXT: v_cmp_eq_u32_e64 s[48:49], 1, v2 -; GLOBALNESS0-NEXT: v_cmp_eq_u32_e64 s[50:51], 0, v2 +; GLOBALNESS0-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2 +; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GLOBALNESS0-NEXT: v_cmp_gt_i32_e32 vcc, 1, v2 +; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GLOBALNESS0-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GLOBALNESS0-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[54:55], 1, v0 +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[56:57], 1, v1 +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[58:59], 1, v3 +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[60:61], 1, v2 ; GLOBALNESS0-NEXT: s_branch .LBB1_4 ; GLOBALNESS0-NEXT: .LBB1_1: ; %bb70.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[50:51] +; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[60:61] ; GLOBALNESS0-NEXT: s_cbranch_vccz .LBB1_29 ; GLOBALNESS0-NEXT: .LBB1_2: ; %Flow15 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 @@ -375,23 +417,23 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[40:41] ; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[36:37] ; GLOBALNESS0-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS0-NEXT: s_mov_b32 s12, s60 -; GLOBALNESS0-NEXT: s_mov_b32 s13, s59 -; GLOBALNESS0-NEXT: s_mov_b32 s14, s58 +; GLOBALNESS0-NEXT: s_mov_b32 s12, s70 +; GLOBALNESS0-NEXT: s_mov_b32 s13, s69 +; GLOBALNESS0-NEXT: s_mov_b32 s14, s68 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS0-NEXT: s_waitcnt lgkmcnt(0) ; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[68:69] +; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[46:47] ; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], -1 ; GLOBALNESS0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_9 ; GLOBALNESS0-NEXT: ; %bb.5: ; %NodeBlock ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: s_cmp_lt_i32 s67, 1 +; GLOBALNESS0-NEXT: s_cmp_lt_i32 s75, 1 ; GLOBALNESS0-NEXT: s_cbranch_scc1 .LBB1_7 ; GLOBALNESS0-NEXT: ; %bb.6: ; %LeafBlock12 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: s_cmp_lg_u32 s67, 1 +; GLOBALNESS0-NEXT: s_cmp_lg_u32 s75, 1 ; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], -1 ; GLOBALNESS0-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_8 @@ -401,7 +443,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GLOBALNESS0-NEXT: .LBB1_8: ; %LeafBlock ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: s_cmp_lg_u32 s67, 0 +; GLOBALNESS0-NEXT: s_cmp_lg_u32 s75, 0 ; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], 0 ; GLOBALNESS0-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GLOBALNESS0-NEXT: .LBB1_9: ; %Flow25 @@ -413,15 +455,15 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[2:3], 0, 0 ; GLOBALNESS0-NEXT: flat_load_dword v0, v[2:3] ; GLOBALNESS0-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GLOBALNESS0-NEXT: v_cmp_gt_i32_e64 s[52:53], 0, v0 +; GLOBALNESS0-NEXT: v_cmp_gt_i32_e64 s[62:63], 0, v0 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v0, 0 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v1, 0x3ff00000 -; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[76:77], s[52:53] +; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[76:77], s[62:63] ; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_26 ; GLOBALNESS0-NEXT: ; %bb.11: ; %bb33.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: global_load_dwordx2 v[0:1], v[2:3], off -; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[44:45] +; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[54:55] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_13 ; GLOBALNESS0-NEXT: ; %bb.12: ; %bb39.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 @@ -433,70 +475,72 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: v_cmp_ne_u32_e32 vcc, 0, v46 ; GLOBALNESS0-NEXT: v_cndmask_b32_e32 v2, 0, v40, vcc ; GLOBALNESS0-NEXT: s_waitcnt vmcnt(0) -; GLOBALNESS0-NEXT: v_cmp_nlt_f64_e64 s[54:55], 0, v[0:1] -; GLOBALNESS0-NEXT: v_cmp_eq_u32_e64 s[56:57], 0, v2 +; GLOBALNESS0-NEXT: v_cmp_nlt_f64_e32 vcc, 0, v[0:1] +; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GLOBALNESS0-NEXT: v_cmp_eq_u32_e64 s[64:65], 0, v2 +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[66:67], 1, v0 ; GLOBALNESS0-NEXT: s_branch .LBB1_16 ; GLOBALNESS0-NEXT: .LBB1_14: ; %Flow16 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 ; GLOBALNESS0-NEXT: s_or_b64 exec, exec, s[4:5] ; GLOBALNESS0-NEXT: .LBB1_15: ; %bb63.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[74:75] +; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[52:53] ; GLOBALNESS0-NEXT: s_cbranch_vccz .LBB1_25 ; GLOBALNESS0-NEXT: .LBB1_16: ; %bb44.i ; GLOBALNESS0-NEXT: ; Parent Loop BB1_4 Depth=1 ; GLOBALNESS0-NEXT: ; => This Inner Loop Header: Depth=2 -; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[70:71] +; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[48:49] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_15 ; GLOBALNESS0-NEXT: ; %bb.17: ; %bb46.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[72:73] +; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[50:51] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_15 ; GLOBALNESS0-NEXT: ; %bb.18: ; %bb50.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[42:43] +; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[42:43] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_21 ; GLOBALNESS0-NEXT: ; %bb.19: ; %bb3.i.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[64:65] +; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[44:45] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_21 ; GLOBALNESS0-NEXT: ; %bb.20: ; %bb6.i.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[54:55] +; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[66:67] ; GLOBALNESS0-NEXT: .LBB1_21: ; %spam.exit.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[46:47] +; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[56:57] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_15 ; GLOBALNESS0-NEXT: ; %bb.22: ; %bb55.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS0-NEXT: s_add_u32 s62, s38, 40 -; GLOBALNESS0-NEXT: s_addc_u32 s63, s39, 0 +; GLOBALNESS0-NEXT: s_add_u32 s72, s38, 40 +; GLOBALNESS0-NEXT: s_addc_u32 s73, s39, 0 ; GLOBALNESS0-NEXT: s_getpc_b64 s[4:5] ; GLOBALNESS0-NEXT: s_add_u32 s4, s4, wobble@gotpcrel32@lo+4 ; GLOBALNESS0-NEXT: s_addc_u32 s5, s5, wobble@gotpcrel32@hi+12 ; GLOBALNESS0-NEXT: s_load_dwordx2 s[78:79], s[4:5], 0x0 ; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[40:41] ; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[36:37] -; GLOBALNESS0-NEXT: s_mov_b64 s[8:9], s[62:63] +; GLOBALNESS0-NEXT: s_mov_b64 s[8:9], s[72:73] ; GLOBALNESS0-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS0-NEXT: s_mov_b32 s12, s60 -; GLOBALNESS0-NEXT: s_mov_b32 s13, s59 -; GLOBALNESS0-NEXT: s_mov_b32 s14, s58 +; GLOBALNESS0-NEXT: s_mov_b32 s12, s70 +; GLOBALNESS0-NEXT: s_mov_b32 s13, s69 +; GLOBALNESS0-NEXT: s_mov_b32 s14, s68 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS0-NEXT: s_waitcnt lgkmcnt(0) ; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[78:79] ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[46:47], 0, 0 ; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[40:41] ; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[36:37] -; GLOBALNESS0-NEXT: s_mov_b64 s[8:9], s[62:63] +; GLOBALNESS0-NEXT: s_mov_b64 s[8:9], s[72:73] ; GLOBALNESS0-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS0-NEXT: s_mov_b32 s12, s60 -; GLOBALNESS0-NEXT: s_mov_b32 s13, s59 -; GLOBALNESS0-NEXT: s_mov_b32 s14, s58 +; GLOBALNESS0-NEXT: s_mov_b32 s12, s70 +; GLOBALNESS0-NEXT: s_mov_b32 s13, s69 +; GLOBALNESS0-NEXT: s_mov_b32 s14, s68 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS0-NEXT: global_store_dwordx2 v[46:47], v[44:45], off ; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[78:79] -; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[4:5], s[56:57] +; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[4:5], s[64:65] ; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_14 ; GLOBALNESS0-NEXT: ; %bb.23: ; %bb62.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 @@ -513,11 +557,11 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: .LBB1_26: ; %Flow24 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: s_or_b64 exec, exec, s[76:77] -; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[4:5], s[52:53] +; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[4:5], s[62:63] ; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_2 ; GLOBALNESS0-NEXT: ; %bb.27: ; %bb67.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[48:49] +; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[58:59] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_1 ; GLOBALNESS0-NEXT: ; %bb.28: ; %bb69.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 @@ -541,9 +585,9 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[40:41] ; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[36:37] ; GLOBALNESS0-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS0-NEXT: s_mov_b32 s12, s60 -; GLOBALNESS0-NEXT: s_mov_b32 s13, s59 -; GLOBALNESS0-NEXT: s_mov_b32 s14, s58 +; GLOBALNESS0-NEXT: s_mov_b32 s12, s70 +; GLOBALNESS0-NEXT: s_mov_b32 s13, s69 +; GLOBALNESS0-NEXT: s_mov_b32 s14, s68 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS0-NEXT: s_getpc_b64 s[16:17] ; GLOBALNESS0-NEXT: s_add_u32 s16, s16, widget@rel32@lo+4 @@ -559,9 +603,9 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[40:41] ; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[36:37] ; GLOBALNESS0-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS0-NEXT: s_mov_b32 s12, s60 -; GLOBALNESS0-NEXT: s_mov_b32 s13, s59 -; GLOBALNESS0-NEXT: s_mov_b32 s14, s58 +; GLOBALNESS0-NEXT: s_mov_b32 s12, s70 +; GLOBALNESS0-NEXT: s_mov_b32 s13, s69 +; GLOBALNESS0-NEXT: s_mov_b32 s14, s68 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS0-NEXT: s_getpc_b64 s[16:17] ; GLOBALNESS0-NEXT: s_add_u32 s16, s16, widget@rel32@lo+4 diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll index 089f3d255e87a..db7d816386a70 100644 --- a/llvm/test/CodeGen/AMDGPU/udiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll @@ -1187,8 +1187,8 @@ define i64 @v_test_udiv_pow2_k_den_i64(i64 %x) { ; GCN-IR-NEXT: v_addc_u32_e64 v1, s[4:5], 0, -1, vcc ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 -; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 ; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff +; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 ; GCN-IR-NEXT: .LBB10_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 diff --git a/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll b/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll index 945b87b6e1628..0acee5bd5ac19 100644 --- a/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll +++ b/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll @@ -52,10 +52,13 @@ define amdgpu_ps float @valley_partially_undef_copy() #0 { ; CHECK-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; CHECK-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], 0 -; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v1 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; CHECK-NEXT: s_waitcnt expcnt(1) +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v1 ; CHECK-NEXT: .LBB1_1: ; %bb9 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; CHECK-NEXT: s_and_b64 vcc, exec, s[0:1] ; CHECK-NEXT: s_cbranch_vccnz .LBB1_1 ; CHECK-NEXT: ; %bb.2: ; %bb11 ; CHECK-NEXT: s_mov_b32 s3, 0xf000 diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll index bc7b375cd404d..a794d139063d5 100644 --- a/llvm/test/CodeGen/AMDGPU/urem64.ll +++ b/llvm/test/CodeGen/AMDGPU/urem64.ll @@ -1294,8 +1294,8 @@ define i64 @v_test_urem_pow2_k_den_i64(i64 %x) { ; GCN-IR-NEXT: v_addc_u32_e64 v7, s[4:5], 0, -1, vcc ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 -; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 ; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff +; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 ; GCN-IR-NEXT: .LBB9_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll b/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll index 41d324dd7abfc..0211c5111c31d 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll @@ -6,15 +6,15 @@ define void @vgpr_descriptor_waterfall_loop_idom_update(ptr %arg) #0 { ; GCN-LABEL: vgpr_descriptor_waterfall_loop_idom_update: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_add_co_u32 v6, vcc_lo, v0, 8 +; GCN-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo ; GCN-NEXT: .LBB0_1: ; %bb0 ; GCN-NEXT: ; =>This Loop Header: Depth=1 ; GCN-NEXT: ; Child Loop BB0_2 Depth 2 -; GCN-NEXT: v_add_co_u32 v6, vcc_lo, v0, 8 -; GCN-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo -; GCN-NEXT: s_mov_b32 s5, exec_lo ; GCN-NEXT: s_clause 0x1 ; GCN-NEXT: flat_load_dwordx2 v[4:5], v[6:7] ; GCN-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN-NEXT: s_mov_b32 s5, exec_lo ; GCN-NEXT: .LBB0_2: ; Parent Loop BB0_1 Depth=1 ; GCN-NEXT: ; => This Inner Loop Header: Depth=2 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) diff --git a/llvm/test/CodeGen/PowerPC/aix-csr-alloc.mir b/llvm/test/CodeGen/PowerPC/aix-csr-alloc.mir index fba410dc0dafc..7c8a5848b402f 100644 --- a/llvm/test/CodeGen/PowerPC/aix-csr-alloc.mir +++ b/llvm/test/CodeGen/PowerPC/aix-csr-alloc.mir @@ -17,5 +17,4 @@ body: | ... # CHECK-DAG: AllocationOrder(GPRC) = [ $r3 $r4 $r5 $r6 $r7 $r8 $r9 $r10 $r11 $r12 $r0 $r31 $r30 $r29 $r28 $r27 $r26 $r25 $r24 $r23 $r22 $r21 $r20 $r19 $r18 $r17 $r16 $r15 $r14 $r13 ] -# CHECK-DAG: AllocationOrder(F4RC) = [ $f0 $f1 $f2 $f3 $f4 $f5 $f6 $f7 $f8 $f9 $f10 $f11 $f12 $f13 $f31 $f30 $f29 $f28 $f27 $f26 $f25 $f24 $f23 $f22 $f21 $f20 $f19 $f18 $f17 $f16 $f15 $f14 ] # CHECK-DAG: AllocationOrder(GPRC_and_GPRC_NOR0) = [ $r3 $r4 $r5 $r6 $r7 $r8 $r9 $r10 $r11 $r12 $r31 $r30 $r29 $r28 $r27 $r26 $r25 $r24 $r23 $r22 $r21 $r20 $r19 $r18 $r17 $r16 $r15 $r14 $r13 ] diff --git a/llvm/test/CodeGen/PowerPC/aix64-csr-alloc.mir b/llvm/test/CodeGen/PowerPC/aix64-csr-alloc.mir index 584b6b0ad46dd..2f93a0ddcfb67 100644 --- a/llvm/test/CodeGen/PowerPC/aix64-csr-alloc.mir +++ b/llvm/test/CodeGen/PowerPC/aix64-csr-alloc.mir @@ -16,6 +16,6 @@ body: | $f1 = COPY %2 BLR8 implicit $lr8, implicit undef $rm, implicit $x3, implicit $f1 ... -# CHECK-DAG: AllocationOrder(VFRC) = [ $vf2 $vf3 $vf4 $vf5 $vf0 $vf1 $vf6 $vf7 $vf8 $vf9 $vf10 $vf11 $vf12 $vf13 $vf14 $vf15 $vf16 $vf17 $vf18 $vf19 $vf31 $vf30 $vf29 $vf28 $vf27 $vf26 $vf25 $vf24 $vf23 $vf22 $vf21 $vf20 ] # CHECK-DAG: AllocationOrder(G8RC_and_G8RC_NOX0) = [ $x3 $x4 $x5 $x6 $x7 $x8 $x9 $x10 $x11 $x12 $x2 $x31 $x30 $x29 $x28 $x27 $x26 $x25 $x24 $x23 $x22 $x21 $x20 $x19 $x18 $x17 $x16 $x15 $x14 ] +# CHECK-DAG: AllocationOrder(G8RC) = [ $x3 $x4 $x5 $x6 $x7 $x8 $x9 $x10 $x11 $x12 $x0 $x2 $x31 $x30 $x29 $x28 $x27 $x26 $x25 $x24 $x23 $x22 $x21 $x20 $x19 $x18 $x17 $x16 $x15 $x14 ] # CHECK-DAG: AllocationOrder(F8RC) = [ $f0 $f1 $f2 $f3 $f4 $f5 $f6 $f7 $f8 $f9 $f10 $f11 $f12 $f13 $f31 $f30 $f29 $f28 $f27 $f26 $f25 $f24 $f23 $f22 $f21 $f20 $f19 $f18 $f17 $f16 $f15 $f14 ] diff --git a/llvm/test/CodeGen/PowerPC/compute-regpressure.ll b/llvm/test/CodeGen/PowerPC/compute-regpressure.ll index 9a1b057c2e38d..9d893b8dbebee 100644 --- a/llvm/test/CodeGen/PowerPC/compute-regpressure.ll +++ b/llvm/test/CodeGen/PowerPC/compute-regpressure.ll @@ -1,7 +1,7 @@ ; REQUIRES: asserts -; RUN: llc -debug-only=regalloc < %s 2>&1 |FileCheck %s --check-prefix=DEBUG +; RUN: llc -debug-only=target-reg-info < %s 2>&1 |FileCheck %s --check-prefix=DEBUG -; DEBUG-COUNT-1: AllocationOrder(VRSAVERC) = [ ] +; DEBUG-COUNT-1: All registers of VRSAVERC are reserved! target triple = "powerpc64le-unknown-linux-gnu" diff --git a/llvm/test/TableGen/bare-minimum-psets.td b/llvm/test/TableGen/bare-minimum-psets.td index 25e0bd2a83d1d..170838dd5f01c 100644 --- a/llvm/test/TableGen/bare-minimum-psets.td +++ b/llvm/test/TableGen/bare-minimum-psets.td @@ -55,7 +55,7 @@ def MyTarget : Target; // CHECK-NEXT: } // CHECK: unsigned MyTargetGenRegisterInfo:: -// CHECK-NEXT: getRegPressureSetLimit(const MachineFunction &MF, unsigned Idx) const { +// CHECK-NEXT: getRawRegPressureSetLimit(const MachineFunction &MF, unsigned Idx) const { // CHECK-NEXT: static const uint8_t PressureLimitTable[] = { // CHECK-NEXT: {{[0-9]+}}, // 0: D_32 // CHECK-NEXT: }; diff --git a/llvm/test/TableGen/inhibit-pset.td b/llvm/test/TableGen/inhibit-pset.td index 1f4f8a176c62c..b3443f1938e8c 100644 --- a/llvm/test/TableGen/inhibit-pset.td +++ b/llvm/test/TableGen/inhibit-pset.td @@ -15,7 +15,7 @@ def X0 : Register <"x0">; // CHECK-NEXT: } // CHECK: unsigned TestTargetGenRegisterInfo:: -// CHECK-NEXT: getRegPressureSetLimit(const MachineFunction &MF, unsigned Idx) const { +// CHECK-NEXT: getRawRegPressureSetLimit(const MachineFunction &MF, unsigned Idx) const { // CHECK-NEXT: static const uint16_t PressureLimitTable[] = { // CHECK-NEXT: {{[0-9]+}}, // 0: GPR32 // CHECK-NEXT: }; diff --git a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll index 9dc2d62f21d95..6633cec659d8e 100644 --- a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll +++ b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll @@ -121,29 +121,31 @@ define protected amdgpu_kernel void @InferPHI(i32 %a, ptr addrspace(1) %b, doubl ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_ashr_i32 s7, s6, 31 ; CHECK-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; CHECK-NEXT: s_add_u32 s4, s0, s4 -; CHECK-NEXT: s_addc_u32 s5, s1, s5 -; CHECK-NEXT: s_add_u32 s0, s4, -8 -; CHECK-NEXT: s_addc_u32 s1, s5, -1 -; CHECK-NEXT: s_cmp_eq_u64 s[4:5], 9 -; CHECK-NEXT: s_cselect_b64 s[4:5], -1, 0 +; CHECK-NEXT: s_add_u32 s0, s0, s4 +; CHECK-NEXT: s_addc_u32 s1, s1, s5 +; CHECK-NEXT: s_add_u32 s4, s0, -8 +; CHECK-NEXT: s_addc_u32 s5, s1, -1 +; CHECK-NEXT: s_cmp_eq_u64 s[0:1], 9 +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 ; CHECK-NEXT: .LBB3_1: ; %bb0 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; CHECK-NEXT: s_and_b64 vcc, exec, s[0:1] ; CHECK-NEXT: s_cbranch_vccnz .LBB3_1 ; CHECK-NEXT: ; %bb.2: ; %bb1 -; CHECK-NEXT: s_mov_b64 s[4:5], exec -; CHECK-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; CHECK-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; CHECK-NEXT: s_mov_b64 s[0:1], exec +; CHECK-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; CHECK-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CHECK-NEXT: s_and_saveexec_b64 s[6:7], vcc ; CHECK-NEXT: s_cbranch_execz .LBB3_4 ; CHECK-NEXT: ; %bb.3: -; CHECK-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; CHECK-NEXT: v_cvt_f64_u32_e32 v[0:1], s4 +; CHECK-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; CHECK-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; CHECK-NEXT: v_mul_f64 v[0:1], s[2:3], v[0:1] ; CHECK-NEXT: v_mov_b32_e32 v2, 0 -; CHECK-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] +; CHECK-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_wbinvl1_vol ; CHECK-NEXT: .LBB3_4: diff --git a/llvm/unittests/CodeGen/MFCommon.inc b/llvm/unittests/CodeGen/MFCommon.inc index 425453fb0e280..08d43c6289c75 100644 --- a/llvm/unittests/CodeGen/MFCommon.inc +++ b/llvm/unittests/CodeGen/MFCommon.inc @@ -50,8 +50,8 @@ public: const char *getRegPressureSetName(unsigned Idx) const override { return "bogus"; } - unsigned getRegPressureSetLimit(const MachineFunction &MF, - unsigned Idx) const override { + unsigned getRawRegPressureSetLimit(const MachineFunction &MF, + unsigned Idx) const override { return 0; } const int * diff --git a/llvm/utils/TableGen/RegisterInfoEmitter.cpp b/llvm/utils/TableGen/RegisterInfoEmitter.cpp index 0c1f5d205ca0f..c3b1d78913acc 100644 --- a/llvm/utils/TableGen/RegisterInfoEmitter.cpp +++ b/llvm/utils/TableGen/RegisterInfoEmitter.cpp @@ -275,8 +275,8 @@ void RegisterInfoEmitter::EmitRegUnitPressure(raw_ostream &OS, OS << "// Get the register unit pressure limit for this dimension.\n" << "// This limit must be adjusted dynamically for reserved registers.\n" << "unsigned " << ClassName << "::\n" - << "getRegPressureSetLimit(const MachineFunction &MF, unsigned Idx) const " - "{\n" + << "getRawRegPressureSetLimit(const MachineFunction &MF, unsigned Idx) " + "const {\n" << " static const " << getMinimalTypeForRange(MaxRegUnitWeight, 32) << " PressureLimitTable[] = {\n"; for (unsigned i = 0; i < NumSets; ++i) { @@ -1128,7 +1128,8 @@ void RegisterInfoEmitter::runTargetHeader(raw_ostream &OS) { << " unsigned getRegUnitWeight(unsigned RegUnit) const override;\n" << " unsigned getNumRegPressureSets() const override;\n" << " const char *getRegPressureSetName(unsigned Idx) const override;\n" - << " unsigned getRegPressureSetLimit(const MachineFunction &MF, unsigned " + << " unsigned getRawRegPressureSetLimit(const MachineFunction &MF, " + "unsigned " "Idx) const override;\n" << " const int *getRegClassPressureSets(" << "const TargetRegisterClass *RC) const override;\n"