llvm · PankajDwivedi-25 · Mar 18, 2025 · Mar 24, 2025 · Mar 26, 2025 · shiltian
diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -899,6 +899,55 @@ bool SIFixSGPRCopies::lowerSpecialCase(MachineInstr &MI,
                                       MI, MI.getDebugLoc())) {
       I = std::next(I);
       MI.eraseFromParent();
+    } else {
+      // At this point, if we still have a VGPR → SGPR copy, it is completely
+      // illegal. We assume that it was intentionally introduced and should be
+      // replaced with the READFIRSTLANE to ensure correctness.
+      const TargetRegisterClass *SRC = MRI->getRegClass(SrcReg);
+      const TargetRegisterClass *DRC = TRI->getPhysRegBaseClass(DstReg);
+      ArrayRef<int16_t> SrcIndices = TRI->getRegSplitParts(SRC, 4);
+      ArrayRef<int16_t> DstIndices = TRI->getRegSplitParts(DRC, 4);
+      assert(SrcIndices.size() == DstIndices.size() &&
+             "Register tuple should match");
+      MachineInstr *FirstMI = nullptr, *LastMI = nullptr;
+      // If SrcReg is a virtual register, can we get the subreg? One way to
+      // handle this: if the def is a COPY, fold the def's source. This should
+      // not work for an AGPR source: is a direct AGPR-to-SGPR copy not allowed?
+      MachineInstr *DefMI = MRI->getVRegDef(SrcReg);
+      // Def's source reg. NOTE(review): DefMI is not null-checked until below.
+      Register DefSrcReg = DefMI->getOperand(1).getReg();
+      // If not a register tuple replace the opcode itself.
+      if (SrcIndices.size() == 1) {
+        MI.setDesc(TII->get(AMDGPU::V_READFIRSTLANE_B32));
+        MI.addOperand(*MI.getParent()->getParent(),
+                      MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
+        return true;
+      }
+
+      if (!DefMI || !DefMI->isCopy() || TRI->isAGPR(*MRI, DefSrcReg))
+        return true;
+
+      for (unsigned Idx = 0; Idx < SrcIndices.size(); ++Idx) {
+        int16_t SubIdx = SrcIndices[Idx];
+        Register DefSrcSubReg = TRI->getSubReg(DefSrcReg, SubIdx);
+        Register DstSubReg = TRI->getSubReg(DstReg, SubIdx);
+        assert(DstSubReg && DefSrcSubReg && "Failed to find subregs!");
+        LastMI = BuildMI(*MI.getParent(), I, MI.getDebugLoc(),
+                         TII->get(AMDGPU::V_READFIRSTLANE_B32), DstSubReg)
+                     .addReg(DefSrcSubReg)
+                     .addReg(DefSrcReg, RegState::Implicit);
+        if (!FirstMI)
+          FirstMI = LastMI;
+      }
+      assert(FirstMI && LastMI);
+
+      FirstMI->addOperand(
+          MachineOperand::CreateReg(DstReg, true /*IsDef*/, true /*IsImp*/));
+
+      LastMI->addRegisterKilled(DefSrcReg, TRI);
+      I = std::next(I);
+      MI.eraseFromParent();
+      DefMI->eraseFromParent();
     }
     return true;
   }

diff --git a/llvm/test/CodeGen/AMDGPU/call-args-inreg-no-sgpr-for-csrspill-xfail.ll b/llvm/test/CodeGen/AMDGPU/call-args-inreg-no-sgpr-for-csrspill-xfail.ll
@@ -1,22 +1,202 @@
-; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs=0 -filetype=null %s 2>&1 | FileCheck -enable-var-scope %s
-
-; CHECK: illegal VGPR to SGPR copy
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -stop-after=finalize-isel | FileCheck %s -check-prefixes=GCN
 
 declare hidden void @external_void_func_a15i32_inreg([15 x i32] inreg) #0
 declare hidden void @external_void_func_a16i32_inreg([16 x i32] inreg) #0
 declare hidden void @external_void_func_a15i32_inreg_i32_inreg([15 x i32] inreg, i32 inreg) #0
 
 define void @test_call_external_void_func_a15i32_inreg([15 x i32] inreg %arg0) #0 {
+  ; GCN-LABEL: name: test_call_external_void_func_a15i32_inreg
+  ; GCN: bb.0 (%ir-block.0):
+  ; GCN-NEXT:   liveins: $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $vgpr0, $vgpr31
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr31
+  ; GCN-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GCN-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr29
+  ; GCN-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr28
+  ; GCN-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr27
+  ; GCN-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr26
+  ; GCN-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr25
+  ; GCN-NEXT:   [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr24
+  ; GCN-NEXT:   [[COPY8:%[0-9]+]]:sgpr_32 = COPY $sgpr23
+  ; GCN-NEXT:   [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr22
+  ; GCN-NEXT:   [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr21
+  ; GCN-NEXT:   [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr20
+  ; GCN-NEXT:   [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr19
+  ; GCN-NEXT:   [[COPY13:%[0-9]+]]:sgpr_32 = COPY $sgpr18
+  ; GCN-NEXT:   [[COPY14:%[0-9]+]]:sgpr_32 = COPY $sgpr17
+  ; GCN-NEXT:   [[COPY15:%[0-9]+]]:sgpr_32 = COPY $sgpr16
+  ; GCN-NEXT:   [[COPY16:%[0-9]+]]:sgpr_32 = COPY $sgpr15
+  ; GCN-NEXT:   [[COPY17:%[0-9]+]]:sgpr_32 = COPY $sgpr14
+  ; GCN-NEXT:   [[COPY18:%[0-9]+]]:sgpr_32 = COPY $sgpr13
+  ; GCN-NEXT:   [[COPY19:%[0-9]+]]:sgpr_32 = COPY $sgpr12
+  ; GCN-NEXT:   [[COPY20:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11
+  ; GCN-NEXT:   [[COPY21:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
+  ; GCN-NEXT:   [[COPY22:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7
+  ; GCN-NEXT:   [[COPY23:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
+  ; GCN-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+  ; GCN-NEXT:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @external_void_func_a15i32_inreg, target-flags(amdgpu-rel32-hi) @external_void_func_a15i32_inreg, implicit-def dead $scc
+  ; GCN-NEXT:   [[COPY24:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GCN-NEXT:   $sgpr4_sgpr5 = COPY [[COPY23]]
+  ; GCN-NEXT:   $sgpr6_sgpr7 = COPY [[COPY22]]
+  ; GCN-NEXT:   $sgpr8_sgpr9 = COPY [[COPY21]]
+  ; GCN-NEXT:   $sgpr10_sgpr11 = COPY [[COPY20]]
+  ; GCN-NEXT:   $sgpr12 = COPY [[COPY19]]
+  ; GCN-NEXT:   $sgpr13 = COPY [[COPY18]]
+  ; GCN-NEXT:   $sgpr14 = COPY [[COPY17]]
+  ; GCN-NEXT:   $sgpr15 = COPY [[COPY16]]
+  ; GCN-NEXT:   $vgpr31 = COPY [[COPY]]
+  ; GCN-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY24]]
+  ; GCN-NEXT:   $sgpr0 = COPY [[COPY15]]
+  ; GCN-NEXT:   $sgpr1 = COPY [[COPY14]]
+  ; GCN-NEXT:   $sgpr2 = COPY [[COPY13]]
+  ; GCN-NEXT:   $sgpr3 = COPY [[COPY12]]
+  ; GCN-NEXT:   $sgpr16 = COPY [[COPY11]]
+  ; GCN-NEXT:   $sgpr17 = COPY [[COPY10]]
+  ; GCN-NEXT:   $sgpr18 = COPY [[COPY9]]
+  ; GCN-NEXT:   $sgpr19 = COPY [[COPY8]]
+  ; GCN-NEXT:   $sgpr20 = COPY [[COPY7]]
+  ; GCN-NEXT:   $sgpr21 = COPY [[COPY6]]
+  ; GCN-NEXT:   $sgpr22 = COPY [[COPY5]]
+  ; GCN-NEXT:   $sgpr23 = COPY [[COPY4]]
+  ; GCN-NEXT:   $sgpr24 = COPY [[COPY3]]
+  ; GCN-NEXT:   $sgpr25 = COPY [[COPY2]]
+  ; GCN-NEXT:   $sgpr26 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
+  ; GCN-NEXT:   $sgpr30_sgpr31 = SI_CALL killed [[SI_PC_ADD_REL_OFFSET]], @external_void_func_a15i32_inreg, csr_amdgpu, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26
+  ; GCN-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+  ; GCN-NEXT:   SI_RETURN
   call void @external_void_func_a15i32_inreg([15 x i32] inreg %arg0)
   ret void
 }
 
 define void @test_call_external_void_func_a16i32_inreg([16 x i32] inreg %arg0) #0 {
+  ; GCN-LABEL: name: test_call_external_void_func_a16i32_inreg
+  ; GCN: bb.0 (%ir-block.0):
+  ; GCN-NEXT:   liveins: $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $vgpr0, $vgpr1, $vgpr31
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr31
+  ; GCN-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GCN-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GCN-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr29
+  ; GCN-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr28
+  ; GCN-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr27
+  ; GCN-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr26
+  ; GCN-NEXT:   [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr25
+  ; GCN-NEXT:   [[COPY8:%[0-9]+]]:sgpr_32 = COPY $sgpr24
+  ; GCN-NEXT:   [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr23
+  ; GCN-NEXT:   [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr22
+  ; GCN-NEXT:   [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr21
+  ; GCN-NEXT:   [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr20
+  ; GCN-NEXT:   [[COPY13:%[0-9]+]]:sgpr_32 = COPY $sgpr19
+  ; GCN-NEXT:   [[COPY14:%[0-9]+]]:sgpr_32 = COPY $sgpr18
+  ; GCN-NEXT:   [[COPY15:%[0-9]+]]:sgpr_32 = COPY $sgpr17
+  ; GCN-NEXT:   [[COPY16:%[0-9]+]]:sgpr_32 = COPY $sgpr16
+  ; GCN-NEXT:   [[COPY17:%[0-9]+]]:sgpr_32 = COPY $sgpr15
+  ; GCN-NEXT:   [[COPY18:%[0-9]+]]:sgpr_32 = COPY $sgpr14
+  ; GCN-NEXT:   [[COPY19:%[0-9]+]]:sgpr_32 = COPY $sgpr13
+  ; GCN-NEXT:   [[COPY20:%[0-9]+]]:sgpr_32 = COPY $sgpr12
+  ; GCN-NEXT:   [[COPY21:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11
+  ; GCN-NEXT:   [[COPY22:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
+  ; GCN-NEXT:   [[COPY23:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7
+  ; GCN-NEXT:   [[COPY24:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
+  ; GCN-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+  ; GCN-NEXT:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @external_void_func_a16i32_inreg, target-flags(amdgpu-rel32-hi) @external_void_func_a16i32_inreg, implicit-def dead $scc
+  ; GCN-NEXT:   [[COPY25:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GCN-NEXT:   $sgpr4_sgpr5 = COPY [[COPY24]]
+  ; GCN-NEXT:   $sgpr6_sgpr7 = COPY [[COPY23]]
+  ; GCN-NEXT:   $sgpr8_sgpr9 = COPY [[COPY22]]
+  ; GCN-NEXT:   $sgpr10_sgpr11 = COPY [[COPY21]]
+  ; GCN-NEXT:   $sgpr12 = COPY [[COPY20]]
+  ; GCN-NEXT:   $sgpr13 = COPY [[COPY19]]
+  ; GCN-NEXT:   $sgpr14 = COPY [[COPY18]]
+  ; GCN-NEXT:   $sgpr15 = COPY [[COPY17]]
+  ; GCN-NEXT:   $vgpr31 = COPY [[COPY]]
+  ; GCN-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY25]]
+  ; GCN-NEXT:   $sgpr0 = COPY [[COPY16]]
+  ; GCN-NEXT:   $sgpr1 = COPY [[COPY15]]
+  ; GCN-NEXT:   $sgpr2 = COPY [[COPY14]]
+  ; GCN-NEXT:   $sgpr3 = COPY [[COPY13]]
+  ; GCN-NEXT:   $sgpr16 = COPY [[COPY12]]
+  ; GCN-NEXT:   $sgpr17 = COPY [[COPY11]]
+  ; GCN-NEXT:   $sgpr18 = COPY [[COPY10]]
+  ; GCN-NEXT:   $sgpr19 = COPY [[COPY9]]
+  ; GCN-NEXT:   $sgpr20 = COPY [[COPY8]]
+  ; GCN-NEXT:   $sgpr21 = COPY [[COPY7]]
+  ; GCN-NEXT:   $sgpr22 = COPY [[COPY6]]
+  ; GCN-NEXT:   $sgpr23 = COPY [[COPY5]]
+  ; GCN-NEXT:   $sgpr24 = COPY [[COPY4]]
+  ; GCN-NEXT:   $sgpr25 = COPY [[COPY3]]
+  ; GCN-NEXT:   $sgpr26 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+  ; GCN-NEXT:   $sgpr27 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
+  ; GCN-NEXT:   $sgpr30_sgpr31 = SI_CALL killed [[SI_PC_ADD_REL_OFFSET]], @external_void_func_a16i32_inreg, csr_amdgpu, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27
+  ; GCN-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+  ; GCN-NEXT:   SI_RETURN
   call void @external_void_func_a16i32_inreg([16 x i32] inreg %arg0)
   ret void
 }
 
 define void @test_call_external_void_func_a15i32_inreg_i32_inreg([15 x i32] inreg %arg0, i32 inreg %arg1) #0 {
+  ; GCN-LABEL: name: test_call_external_void_func_a15i32_inreg_i32_inreg
+  ; GCN: bb.0 (%ir-block.0):
+  ; GCN-NEXT:   liveins: $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $vgpr0, $vgpr1, $vgpr31
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr31
+  ; GCN-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GCN-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GCN-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr29
+  ; GCN-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr28
+  ; GCN-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr27
+  ; GCN-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr26
+  ; GCN-NEXT:   [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr25
+  ; GCN-NEXT:   [[COPY8:%[0-9]+]]:sgpr_32 = COPY $sgpr24
+  ; GCN-NEXT:   [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr23
+  ; GCN-NEXT:   [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr22
+  ; GCN-NEXT:   [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr21
+  ; GCN-NEXT:   [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr20
+  ; GCN-NEXT:   [[COPY13:%[0-9]+]]:sgpr_32 = COPY $sgpr19
+  ; GCN-NEXT:   [[COPY14:%[0-9]+]]:sgpr_32 = COPY $sgpr18
+  ; GCN-NEXT:   [[COPY15:%[0-9]+]]:sgpr_32 = COPY $sgpr17
+  ; GCN-NEXT:   [[COPY16:%[0-9]+]]:sgpr_32 = COPY $sgpr16
+  ; GCN-NEXT:   [[COPY17:%[0-9]+]]:sgpr_32 = COPY $sgpr15
+  ; GCN-NEXT:   [[COPY18:%[0-9]+]]:sgpr_32 = COPY $sgpr14
+  ; GCN-NEXT:   [[COPY19:%[0-9]+]]:sgpr_32 = COPY $sgpr13
+  ; GCN-NEXT:   [[COPY20:%[0-9]+]]:sgpr_32 = COPY $sgpr12
+  ; GCN-NEXT:   [[COPY21:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11
+  ; GCN-NEXT:   [[COPY22:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
+  ; GCN-NEXT:   [[COPY23:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7
+  ; GCN-NEXT:   [[COPY24:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
+  ; GCN-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+  ; GCN-NEXT:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @external_void_func_a15i32_inreg_i32_inreg, target-flags(amdgpu-rel32-hi) @external_void_func_a15i32_inreg_i32_inreg, implicit-def dead $scc
+  ; GCN-NEXT:   [[COPY25:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GCN-NEXT:   $sgpr4_sgpr5 = COPY [[COPY24]]
+  ; GCN-NEXT:   $sgpr6_sgpr7 = COPY [[COPY23]]
+  ; GCN-NEXT:   $sgpr8_sgpr9 = COPY [[COPY22]]
+  ; GCN-NEXT:   $sgpr10_sgpr11 = COPY [[COPY21]]
+  ; GCN-NEXT:   $sgpr12 = COPY [[COPY20]]
+  ; GCN-NEXT:   $sgpr13 = COPY [[COPY19]]
+  ; GCN-NEXT:   $sgpr14 = COPY [[COPY18]]
+  ; GCN-NEXT:   $sgpr15 = COPY [[COPY17]]
+  ; GCN-NEXT:   $vgpr31 = COPY [[COPY]]
+  ; GCN-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY25]]
+  ; GCN-NEXT:   $sgpr0 = COPY [[COPY16]]
+  ; GCN-NEXT:   $sgpr1 = COPY [[COPY15]]
+  ; GCN-NEXT:   $sgpr2 = COPY [[COPY14]]
+  ; GCN-NEXT:   $sgpr3 = COPY [[COPY13]]
+  ; GCN-NEXT:   $sgpr16 = COPY [[COPY12]]
+  ; GCN-NEXT:   $sgpr17 = COPY [[COPY11]]
+  ; GCN-NEXT:   $sgpr18 = COPY [[COPY10]]
+  ; GCN-NEXT:   $sgpr19 = COPY [[COPY9]]
+  ; GCN-NEXT:   $sgpr20 = COPY [[COPY8]]
+  ; GCN-NEXT:   $sgpr21 = COPY [[COPY7]]
+  ; GCN-NEXT:   $sgpr22 = COPY [[COPY6]]
+  ; GCN-NEXT:   $sgpr23 = COPY [[COPY5]]
+  ; GCN-NEXT:   $sgpr24 = COPY [[COPY4]]
+  ; GCN-NEXT:   $sgpr25 = COPY [[COPY3]]
+  ; GCN-NEXT:   $sgpr26 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+  ; GCN-NEXT:   $sgpr27 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
+  ; GCN-NEXT:   $sgpr30_sgpr31 = SI_CALL killed [[SI_PC_ADD_REL_OFFSET]], @external_void_func_a15i32_inreg_i32_inreg, csr_amdgpu, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27
+  ; GCN-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+  ; GCN-NEXT:   SI_RETURN
   call void @external_void_func_a15i32_inreg_i32_inreg([15 x i32] inreg %arg0, i32 inreg %arg1)
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/fix-illegal-copy.ll b/llvm/test/CodeGen/AMDGPU/fix-illegal-copy.ll
@@ -0,0 +1,20 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=finalize-isel | FileCheck %s -check-prefixes=GFX11
+
+define amdgpu_ps i32 @s_copysign_f32_bf16(float inreg %mag, bfloat inreg %sign.bf16) {
+  ; GFX11-LABEL: name: s_copysign_f32_bf16
+  ; GFX11: bb.0 (%ir-block.0):
+  ; GFX11-NEXT:   liveins: $sgpr0, $sgpr1
+  ; GFX11-NEXT: {{  $}}
+  ; GFX11-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX11-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX11-NEXT:   [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 16, [[COPY]], implicit $exec
+  ; GFX11-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483647
+  ; GFX11-NEXT:   [[V_BFI_B32_e64_:%[0-9]+]]:vgpr_32 = V_BFI_B32_e64 killed [[S_MOV_B32_]], [[COPY1]], killed [[V_LSHLREV_B32_e64_]], implicit $exec
+  ; GFX11-NEXT:   $sgpr0 = V_READFIRSTLANE_B32 [[V_BFI_B32_e64_]], implicit $exec
+  ; GFX11-NEXT:   SI_RETURN_TO_EPILOG $sgpr0
+  %sign = fpext bfloat %sign.bf16 to float
+  %op = call float @llvm.copysign.f32(float %mag, float %sign)
+  %cast = bitcast float %op to i32
+  ret i32 %cast
+}