[AMDGPU] Insert readfirstlane for function return values in SGPRs. #135326
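This patch changes SITargetLowering::LowerReturn so that any return value assigned to a physical SGPR is first routed through llvm.amdgcn.readfirstlane before the copy into the return register. The value feeding such a return may be produced in VGPRs (for example, the result of a buffer atomic), and the readfirstlane makes it explicitly uniform so the copy into the scalar return register is well defined.

A minimal sketch of the kind of function this affects, assuming the hypothetical name sgpr_return_example (the intrinsic call mirrors the existing buffer-atomic-fadd.f64.ll tests): an amdgpu_ps function that returns a double in sgpr0/sgpr1 even though the atomic result is computed in VGPRs.

declare double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double, <4 x i32>, i32, i32, i32)

define amdgpu_ps double @sgpr_return_example(double %val, <4 x i32> inreg %rsrc, i32 inreg %soffset) {
  ; The atomic result lives in VGPRs, but the amdgpu_ps calling convention
  ; returns the double in sgpr0/sgpr1, so the new lowering wraps each 32-bit
  ; half in a readfirstlane before copying it into the return registers.
  %ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0)
  ret double %ret
}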

7 changes: 6 additions & 1 deletion llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3225,6 +3225,7 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
const SDLoc &DL, SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();

if (AMDGPU::isKernel(CallConv)) {
return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
@@ -3251,6 +3252,8 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
SmallVector<SDValue, 48> RetOps;
RetOps.push_back(Chain); // Operand #0 = Chain (updated below)

SDValue ReadFirstLane =
DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
// Copy the result values into the output registers.
for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
++I, ++RealRVLocIdx) {
@@ -3278,7 +3281,9 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
default:
llvm_unreachable("Unknown loc info!");
}

if (TRI->isSGPRPhysReg(VA.getLocReg()))
Arg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Arg.getValueType(),
ReadFirstLane, Arg);
Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Glue);
Glue = Chain.getValue(1);
RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
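After instruction selection, the ISD::INTRINSIC_WO_CHAIN node carrying Intrinsic::amdgcn_readfirstlane becomes a V_READFIRSTLANE_B32, which is why the updated MIR checks in buffer-atomic-fadd.f64.ll below read each 32-bit half of the atomic result into an sreg_32_xm0 register before the copies to $sgpr0/$sgpr1. For return values that are already uniform the readfirstlane appears to fold away; in add64-low-32-bits-known-zero.ll and constrained-shift.ll the only visible difference is a reordering of the scalar instructions in the shader epilog.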
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/AMDGPU/add64-low-32-bits-known-zero.ll
@@ -148,8 +148,8 @@ define <2 x i64> @v_add_v2i64_nonsplat_const_low_bits_known0_0(<2 x i64> %reg) {
define amdgpu_ps <2 x i64> @s_add_v2i64_splat_const_low_bits_known0_0(<2 x i64> inreg %reg) {
; GFX9-LABEL: s_add_v2i64_splat_const_low_bits_known0_0:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_add_i32 s1, s1, 1
; GFX9-NEXT: s_add_i32 s3, s3, 1
; GFX9-NEXT: s_add_i32 s1, s1, 1
; GFX9-NEXT: ; return to shader part epilog
%add = add <2 x i64> %reg, <i64 4294967296, i64 4294967296> ; (1 << 32)
ret <2 x i64> %add
@@ -158,8 +158,8 @@ define amdgpu_ps <2 x i64> @s_add_v2i64_splat_const_low_bits_known0_0(<2 x i64>
define amdgpu_ps <2 x i64> @s_add_v2i64_nonsplat_const_low_bits_known0_0(<2 x i64> inreg %reg) {
; GFX9-LABEL: s_add_v2i64_nonsplat_const_low_bits_known0_0:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_add_i32 s1, s1, 1
; GFX9-NEXT: s_add_i32 s3, s3, 2
; GFX9-NEXT: s_add_i32 s1, s1, 1
; GFX9-NEXT: ; return to shader part epilog
%add = add <2 x i64> %reg, <i64 4294967296, i64 8589934592> ; (1 << 32), (1 << 33)
ret <2 x i64> %add
48 changes: 32 additions & 16 deletions llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f64.ll
@@ -110,9 +110,11 @@ define amdgpu_ps double @buffer_atomic_fadd_f64_offset_rtn(double %val, <4 x i32
; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFSET_RTN [[COPY7]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN]].sub0
; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY8]], implicit $exec
; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN]].sub1
; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[COPY8]]
; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[COPY9]]
; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY9]], implicit $exec
; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
%ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0)
ret double %ret
@@ -136,9 +138,11 @@ define amdgpu_ps double @buffer_atomic_fadd_f64_offen_rtn(double %val, <4 x i32>
; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFEN_RTN [[COPY8]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN]].sub0
; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY9]], implicit $exec
; GFX90A_GFX942-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN]].sub1
; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[COPY9]]
; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[COPY10]]
; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY10]], implicit $exec
; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
%ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
ret double %ret
@@ -162,9 +166,11 @@ define amdgpu_ps double @buffer_atomic_fadd_f64_idxen_rtn(double %val, <4 x i32>
; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_IDXEN_RTN [[COPY8]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN]].sub0
; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY9]], implicit $exec
; GFX90A_GFX942-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN]].sub1
; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[COPY9]]
; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[COPY10]]
; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY10]], implicit $exec
; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
%ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
ret double %ret
@@ -190,9 +196,11 @@ define amdgpu_ps double @buffer_atomic_fadd_f64_bothen_rtn(double %val, <4 x i32
; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN [[COPY9]], killed [[REG_SEQUENCE2]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
; GFX90A_GFX942-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN]].sub0
; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY10]], implicit $exec
; GFX90A_GFX942-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN]].sub1
; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[COPY10]]
; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[COPY11]]
; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY11]], implicit $exec
; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
%ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret double %ret
@@ -334,9 +342,11 @@ define amdgpu_ps double @buffer_ptr_atomic_fadd_f64_offset_rtn(double %val, ptr
; GFX90A_GFX942-NEXT: [[COPY11:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFSET_RTN [[COPY11]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
; GFX90A_GFX942-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN]].sub0
; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY12]], implicit $exec
; GFX90A_GFX942-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN]].sub1
; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[COPY12]]
; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[COPY13]]
; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY13]], implicit $exec
; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
%ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f64(double %val, ptr addrspace(8) %rsrc, i32 0, i32 %soffset, i32 0)
ret double %ret
@@ -366,9 +376,11 @@ define amdgpu_ps double @buffer_ptr_atomic_fadd_f64_offen_rtn(double %val, ptr a
; GFX90A_GFX942-NEXT: [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFEN_RTN [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
; GFX90A_GFX942-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN]].sub0
; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY13]], implicit $exec
; GFX90A_GFX942-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN]].sub1
; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[COPY13]]
; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[COPY14]]
; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY14]], implicit $exec
; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
%ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f64(double %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0)
ret double %ret
@@ -398,9 +410,11 @@ define amdgpu_ps double @buffer_ptr_atomic_fadd_f64_idxen_rtn(double %val, ptr a
; GFX90A_GFX942-NEXT: [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_IDXEN_RTN [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
; GFX90A_GFX942-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN]].sub0
; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY13]], implicit $exec
; GFX90A_GFX942-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN]].sub1
; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[COPY13]]
; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[COPY14]]
; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY14]], implicit $exec
; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
%ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
ret double %ret
@@ -432,9 +446,11 @@ define amdgpu_ps double @buffer_ptr_atomic_fadd_f64_bothen_rtn(double %val, ptr
; GFX90A_GFX942-NEXT: [[COPY13:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN [[COPY13]], killed [[REG_SEQUENCE4]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
; GFX90A_GFX942-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN]].sub0
; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY14]], implicit $exec
; GFX90A_GFX942-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN]].sub1
; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[COPY14]]
; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[COPY15]]
; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY15]], implicit $exec
; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
%ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret double %ret
30 changes: 15 additions & 15 deletions llvm/test/CodeGen/AMDGPU/constrained-shift.ll
@@ -168,26 +168,26 @@ define <4 x i32> @csh_v4i32(<4 x i32> %a, <4 x i32> %b) {
define amdgpu_ps <4 x i32> @s_csh_v4i32(<4 x i32> inreg %a, <4 x i32> inreg %b) {
; CHECK-LABEL: s_csh_v4i32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_lshl_b32 s8, s0, s4
; CHECK-NEXT: s_lshl_b32 s9, s1, s5
; CHECK-NEXT: s_lshl_b32 s10, s2, s6
; CHECK-NEXT: s_lshl_b32 s11, s3, s7
; CHECK-NEXT: s_lshr_b32 s12, s0, s4
; CHECK-NEXT: s_lshr_b32 s13, s1, s5
; CHECK-NEXT: s_lshr_b32 s14, s2, s6
; CHECK-NEXT: s_lshr_b32 s15, s3, s7
; CHECK-NEXT: s_ashr_i32 s3, s3, s7
; CHECK-NEXT: s_ashr_i32 s2, s2, s6
; CHECK-NEXT: s_ashr_i32 s1, s1, s5
; CHECK-NEXT: s_lshl_b32 s8, s3, s7
; CHECK-NEXT: s_lshl_b32 s9, s2, s6
; CHECK-NEXT: s_lshl_b32 s10, s1, s5
; CHECK-NEXT: s_lshl_b32 s11, s0, s4
; CHECK-NEXT: s_lshr_b32 s12, s3, s7
; CHECK-NEXT: s_lshr_b32 s13, s2, s6
; CHECK-NEXT: s_lshr_b32 s14, s1, s5
; CHECK-NEXT: s_lshr_b32 s15, s0, s4
; CHECK-NEXT: s_ashr_i32 s0, s0, s4
; CHECK-NEXT: s_ashr_i32 s1, s1, s5
; CHECK-NEXT: s_ashr_i32 s2, s2, s6
; CHECK-NEXT: s_ashr_i32 s3, s3, s7
; CHECK-NEXT: s_add_i32 s4, s11, s15
; CHECK-NEXT: s_add_i32 s5, s10, s14
; CHECK-NEXT: s_add_i32 s6, s9, s13
; CHECK-NEXT: s_add_i32 s7, s8, s12
; CHECK-NEXT: s_add_i32 s0, s7, s0
; CHECK-NEXT: s_add_i32 s1, s6, s1
; CHECK-NEXT: s_add_i32 s2, s5, s2
; CHECK-NEXT: s_add_i32 s3, s4, s3
; CHECK-NEXT: s_add_i32 s3, s7, s3
; CHECK-NEXT: s_add_i32 s2, s6, s2
; CHECK-NEXT: s_add_i32 s1, s5, s1
; CHECK-NEXT: s_add_i32 s0, s4, s0
; CHECK-NEXT: ; return to shader part epilog
;
; GISEL-LABEL: s_csh_v4i32: