Skip to content

Commit 99dc561

Browse files
authored
[AMDGPU] Use a general form of intrinsic for tensor load/store (#182334)
The intrinsic has five arguments for the tensor descriptor (D#), while the fifth one is reserved for future targets, and it will be silently ignored in codegen for gfx1250. For tensors up to 2D, only the first two D# groups are meaningful and the rest should be zero-initialized.
1 parent 689ecf8 commit 99dc561

File tree

19 files changed

+211
-389
lines changed

19 files changed

+211
-389
lines changed

clang/include/clang/Basic/BuiltinsAMDGPU.td

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -752,10 +752,9 @@ def __builtin_amdgcn_global_store_async_from_lds_b128 : AMDGPUBuiltin<"void(_Ext
752752
def __builtin_amdgcn_ds_atomic_async_barrier_arrive_b64 : AMDGPUBuiltin<"void(long int address_space<3> *)", [Const], "gfx1250-insts">;
753753
def __builtin_amdgcn_ds_atomic_barrier_arrive_rtn_b64 : AMDGPUBuiltin<"long int(long int address_space<3> *, long int)", [Const], "gfx1250-insts">;
754754

755-
def __builtin_amdgcn_tensor_load_to_lds : AMDGPUBuiltin<"void(_ExtVector<4, int>, _ExtVector<8, int>, _ExtVector<4, int>, _ExtVector<4, int>, _Constant int)", [Const], "gfx1250-insts">;
756-
def __builtin_amdgcn_tensor_load_to_lds_d2 : AMDGPUBuiltin<"void(_ExtVector<4, int>, _ExtVector<8, int>, _Constant int)", [Const], "gfx1250-insts">;
757-
def __builtin_amdgcn_tensor_store_from_lds : AMDGPUBuiltin<"void(_ExtVector<4, int>, _ExtVector<8, int>, _ExtVector<4, int>, _ExtVector<4, int>, _Constant int)", [Const], "gfx1250-insts">;
758-
def __builtin_amdgcn_tensor_store_from_lds_d2 : AMDGPUBuiltin<"void(_ExtVector<4, int>, _ExtVector<8, int>, _Constant int)", [Const], "gfx1250-insts">;
755+
def __builtin_amdgcn_tensor_load_to_lds : AMDGPUBuiltin<"void(_ExtVector<4, int>, _ExtVector<8, int>, _ExtVector<4, int>, _ExtVector<4, int>, _ExtVector<8, int>, _Constant int)", [Const], "gfx1250-insts">;
756+
def __builtin_amdgcn_tensor_store_from_lds : AMDGPUBuiltin<"void(_ExtVector<4, int>, _ExtVector<8, int>, _ExtVector<4, int>, _ExtVector<4, int>, _ExtVector<8, int>, _Constant int)", [Const], "gfx1250-insts">;
757+
759758

760759
def __builtin_amdgcn_global_load_tr4_b64_v2i32 : AMDGPUBuiltin<"_ExtVector<2, int>(_ExtVector<2, int> address_space<1> *)", [Const], "transpose-load-f4f6-insts,wavefrontsize32">;
761760
def __builtin_amdgcn_global_load_tr8_b64_v2i32 : AMDGPUBuiltin<"_ExtVector<2, int>(_ExtVector<2, int> address_space<1> *)", [Const], "gfx1250-insts,wavefrontsize32">;

clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-tensor-load-store.cl

Lines changed: 38 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5,42 +5,72 @@
55
typedef int v4i __attribute__((ext_vector_type(4)));
66
typedef int v8i __attribute__((ext_vector_type(8)));
77

8+
static v4i v4i_zeros = (v4i){0,0,0,0};
9+
static v8i v8i_zeros = (v8i){0,0,0,0,0,0,0,0};
10+
811
// CHECK-GFX1250-LABEL: @test_amdgcn_tensor_load_to_lds(
912
// CHECK-GFX1250-NEXT: entry:
10-
// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.tensor.load.to.lds(<4 x i32> [[SG0:%.*]], <8 x i32> [[SG1:%.*]], <4 x i32> [[SG2:%.*]], <4 x i32> [[SG3:%.*]], i32 0)
13+
// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.tensor.load.to.lds(<4 x i32> [[SG0:%.*]], <8 x i32> [[SG1:%.*]], <4 x i32> [[SG2:%.*]], <4 x i32> [[SG3:%.*]], <8 x i32> zeroinitializer, i32 0)
1114
// CHECK-GFX1250-NEXT: ret void
1215
//
1316
void test_amdgcn_tensor_load_to_lds(v4i sg0, v8i sg1, v4i sg2, v4i sg3)
1417
{
15-
__builtin_amdgcn_tensor_load_to_lds(sg0, sg1, sg2, sg3, 0);
18+
__builtin_amdgcn_tensor_load_to_lds(sg0, sg1, sg2, sg3, v8i_zeros, 0);
1619
}
1720

1821
// CHECK-GFX1250-LABEL: @test_amdgcn_tensor_load_to_lds_d2(
1922
// CHECK-GFX1250-NEXT: entry:
20-
// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.tensor.load.to.lds.d2(<4 x i32> [[SG0:%.*]], <8 x i32> [[SG1:%.*]], i32 27)
23+
// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.tensor.load.to.lds(<4 x i32> [[SG0:%.*]], <8 x i32> [[SG1:%.*]], <4 x i32> zeroinitializer, <4 x i32> zeroinitializer, <8 x i32> zeroinitializer, i32 27)
2124
// CHECK-GFX1250-NEXT: ret void
2225
//
2326
void test_amdgcn_tensor_load_to_lds_d2(v4i sg0, v8i sg1)
2427
{
25-
__builtin_amdgcn_tensor_load_to_lds_d2(sg0, sg1, 27);
28+
__builtin_amdgcn_tensor_load_to_lds(sg0, sg1, v4i_zeros, v4i_zeros, v8i_zeros, 27);
2629
}
2730

2831
// CHECK-GFX1250-LABEL: @test_amdgcn_tensor_store_from_lds(
2932
// CHECK-GFX1250-NEXT: entry:
30-
// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.tensor.store.from.lds(<4 x i32> [[SG0:%.*]], <8 x i32> [[SG1:%.*]], <4 x i32> [[SG2:%.*]], <4 x i32> [[SG3:%.*]], i32 22)
33+
// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.tensor.store.from.lds(<4 x i32> [[SG0:%.*]], <8 x i32> [[SG1:%.*]], <4 x i32> [[SG2:%.*]], <4 x i32> [[SG3:%.*]], <8 x i32> zeroinitializer, i32 22)
3134
// CHECK-GFX1250-NEXT: ret void
3235
//
3336
void test_amdgcn_tensor_store_from_lds(v4i sg0, v8i sg1, v4i sg2, v4i sg3)
3437
{
35-
__builtin_amdgcn_tensor_store_from_lds(sg0, sg1, sg2, sg3, 22);
38+
__builtin_amdgcn_tensor_store_from_lds(sg0, sg1, sg2, sg3, v8i_zeros, 22);
3639
}
3740

3841
// CHECK-GFX1250-LABEL: @test_amdgcn_tensor_store_from_lds_d2(
3942
// CHECK-GFX1250-NEXT: entry:
40-
// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.tensor.store.from.lds.d2(<4 x i32> [[SG0:%.*]], <8 x i32> [[SG1:%.*]], i32 0)
43+
// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.tensor.store.from.lds(<4 x i32> [[SG0:%.*]], <8 x i32> [[SG1:%.*]], <4 x i32> zeroinitializer, <4 x i32> zeroinitializer, <8 x i32> zeroinitializer, i32 0)
4144
// CHECK-GFX1250-NEXT: ret void
4245
//
4346
void test_amdgcn_tensor_store_from_lds_d2(v4i sg0, v8i sg1)
4447
{
45-
__builtin_amdgcn_tensor_store_from_lds_d2(sg0, sg1, 0);
48+
__builtin_amdgcn_tensor_store_from_lds(sg0, sg1, v4i_zeros, v4i_zeros, v8i_zeros, 0);
49+
}
50+
51+
//=======================================================================
52+
// It is fine to pass 5 arguments as the tensor descriptor, but the fifth one
53+
// will be ignored by llvm CodeGen for gfx1250, which only supports D# up
54+
// to 4 groups.
55+
//========================================================================
56+
57+
58+
// CHECK-GFX1250-LABEL: @test_amdgcn_tensor_load_to_lds_d5(
59+
// CHECK-GFX1250-NEXT: entry:
60+
// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.tensor.load.to.lds(<4 x i32> [[SG0:%.*]], <8 x i32> [[SG1:%.*]], <4 x i32> [[SG2:%.*]], <4 x i32> [[SG3:%.*]], <8 x i32> [[SG4:%.*]], i32 0)
61+
// CHECK-GFX1250-NEXT: ret void
62+
//
63+
void test_amdgcn_tensor_load_to_lds_d5(v4i sg0, v8i sg1, v4i sg2, v4i sg3, v8i sg4)
64+
{
65+
__builtin_amdgcn_tensor_load_to_lds(sg0, sg1, sg2, sg3, sg4, 0);
66+
}
67+
68+
// CHECK-GFX1250-LABEL: @test_amdgcn_tensor_store_from_lds_d5(
69+
// CHECK-GFX1250-NEXT: entry:
70+
// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.tensor.store.from.lds(<4 x i32> [[SG0:%.*]], <8 x i32> [[SG1:%.*]], <4 x i32> [[SG2:%.*]], <4 x i32> [[SG3:%.*]], <8 x i32> [[SG4:%.*]], i32 0)
71+
// CHECK-GFX1250-NEXT: ret void
72+
//
73+
void test_amdgcn_tensor_store_from_lds_d5(v4i sg0, v8i sg1, v4i sg2, v4i sg3, v8i sg4)
74+
{
75+
__builtin_amdgcn_tensor_store_from_lds(sg0, sg1, sg2, sg3, sg4, 0);
4676
}

clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -183,12 +183,10 @@ void test_amdgcn_async_load_store_lds_cpol(global char* gaddr8, global int *gadd
183183
__builtin_amdgcn_global_store_async_from_lds_b128(gaddr128, laddr128, 16, cpol); // expected-error {{'__builtin_amdgcn_global_store_async_from_lds_b128' must be a constant integer}}
184184
}
185185

186-
void test_amdgcn_tensor_load_store(v4i sg0, v8i sg1, v4i sg2, v4i sg3, int cpol)
186+
void test_amdgcn_tensor_load_store(v4i sg0, v8i sg1, v4i sg2, v4i sg3, v8i sg4, int cpol)
187187
{
188-
__builtin_amdgcn_tensor_load_to_lds(sg0, sg1, sg2, sg3, cpol); // expected-error {{'__builtin_amdgcn_tensor_load_to_lds' must be a constant integer}}
189-
__builtin_amdgcn_tensor_load_to_lds_d2(sg0, sg1, cpol); // expected-error {{'__builtin_amdgcn_tensor_load_to_lds_d2' must be a constant integer}}
190-
__builtin_amdgcn_tensor_store_from_lds(sg0, sg1, sg2, sg3, cpol); // expected-error {{'__builtin_amdgcn_tensor_store_from_lds' must be a constant integer}}
191-
__builtin_amdgcn_tensor_store_from_lds_d2(sg0, sg1, cpol); // expected-error {{'__builtin_amdgcn_tensor_store_from_lds_d2' must be a constant integer}}
188+
__builtin_amdgcn_tensor_load_to_lds(sg0, sg1, sg2, sg3, sg4, cpol); // expected-error {{'__builtin_amdgcn_tensor_load_to_lds' must be a constant integer}}
189+
__builtin_amdgcn_tensor_store_from_lds(sg0, sg1, sg2, sg3, sg4, cpol); // expected-error {{'__builtin_amdgcn_tensor_store_from_lds' must be a constant integer}}
192190
}
193191

194192
void test_prefetch(generic void *fptr, global void *gptr, int cpol) {

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 7 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -4194,41 +4194,24 @@ def int_amdgcn_swmmac_f16_16x16x128_bf8_bf8 : AMDGPUSWmmacIntrinsicIdxReuse<llvm
41944194
def int_amdgcn_swmmac_i32_16x16x128_iu8 : AMDGPUSWmmacIntrinsicABIdxClamp<llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty>;
41954195
}
41964196

4197-
41984197
class AMDGPUTensorLoadStore:
41994198
Intrinsic<
42004199
[],
42014200
[llvm_v4i32_ty, // D# group 0
42024201
llvm_v8i32_ty, // D# group 1
4203-
llvm_v4i32_ty, // D# group 2
4204-
llvm_v4i32_ty, // D# group 3
4202+
llvm_v4i32_ty, // D# group 2: groups 2 and 3 should be zero-initialized for D# up to 2D.
4203+
llvm_v4i32_ty, // D# group 3:
4204+
llvm_v8i32_ty, // D# group 4: reserved for future targets, use <8 x i32> zeroinitializer for now.
4205+
// This argument will be silently ignored.
42054206
llvm_i32_ty], // cachepolicy:
42064207
// bits [0-2] = th
42074208
// bits [3-4] = scope
4208-
[IntrInaccessibleMemOrArgMemOnly, ImmArg<ArgIndex<4>>, IntrWillReturn, IntrConvergent, IntrNoCallback, IntrNoFree],
4209-
"", [SDNPMemOperand]
4210-
>;
4211-
4212-
class AMDGPUTensorLoadStoreD2:
4213-
Intrinsic<
4214-
[],
4215-
[llvm_v4i32_ty, // D# group 0
4216-
llvm_v8i32_ty, // D# group 1
4217-
llvm_i32_ty], // cachepolicy:
4218-
// bits [0-2] = th
4219-
// bits [3-4] = scope
4220-
[IntrInaccessibleMemOrArgMemOnly, ImmArg<ArgIndex<2>>, IntrWillReturn, IntrConvergent, IntrNoCallback, IntrNoFree],
4209+
[IntrInaccessibleMemOrArgMemOnly, ImmArg<ArgIndex<5>>, IntrWillReturn, IntrConvergent, IntrNoCallback, IntrNoFree],
42214210
"", [SDNPMemOperand]
42224211
>;
42234212

4224-
def int_amdgcn_tensor_load_to_lds :
4225-
ClangBuiltin<"__builtin_amdgcn_tensor_load_to_lds">, AMDGPUTensorLoadStore;
4226-
def int_amdgcn_tensor_store_from_lds :
4227-
ClangBuiltin<"__builtin_amdgcn_tensor_store_from_lds">, AMDGPUTensorLoadStore;
4228-
def int_amdgcn_tensor_load_to_lds_d2 :
4229-
ClangBuiltin<"__builtin_amdgcn_tensor_load_to_lds_d2">, AMDGPUTensorLoadStoreD2;
4230-
def int_amdgcn_tensor_store_from_lds_d2 :
4231-
ClangBuiltin<"__builtin_amdgcn_tensor_store_from_lds_d2">, AMDGPUTensorLoadStoreD2;
4213+
def int_amdgcn_tensor_load_to_lds : ClangBuiltin<"__builtin_amdgcn_tensor_load_to_lds">, AMDGPUTensorLoadStore;
4214+
def int_amdgcn_tensor_store_from_lds : ClangBuiltin<"__builtin_amdgcn_tensor_store_from_lds">, AMDGPUTensorLoadStore;
42324215

42334216
class AMDGPUClusterLoad<LLVMType ptr_ty>:
42344217
Intrinsic<

llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3005,6 +3005,37 @@ void AMDGPUDAGToDAGISel::SelectDSBvhStackIntrinsic(SDNode *N, unsigned IntrID) {
30053005
CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
30063006
}
30073007

3008+
void AMDGPUDAGToDAGISel::SelectTensorLoadStore(SDNode *N, unsigned IntrID) {
3009+
bool IsLoad = IntrID == Intrinsic::amdgcn_tensor_load_to_lds;
3010+
unsigned Opc =
3011+
IsLoad ? AMDGPU::TENSOR_LOAD_TO_LDS : AMDGPU::TENSOR_STORE_FROM_LDS;
3012+
3013+
SmallVector<SDValue, 7> TensorOps;
3014+
// First two groups
3015+
TensorOps.push_back(N->getOperand(2)); // D# group 0
3016+
TensorOps.push_back(N->getOperand(3)); // D# group 1
3017+
3018+
// Use _D2 version if both group 2 and 3 are zero-initialized.
3019+
SDValue Group2 = N->getOperand(4);
3020+
SDValue Group3 = N->getOperand(5);
3021+
if (ISD::isBuildVectorAllZeros(Group2.getNode()) &&
3022+
ISD::isBuildVectorAllZeros(Group3.getNode())) {
3023+
Opc = IsLoad ? AMDGPU::TENSOR_LOAD_TO_LDS_D2
3024+
: AMDGPU::TENSOR_STORE_FROM_LDS_D2;
3025+
} else { // Has at least 4 groups
3026+
TensorOps.push_back(Group2); // D# group 2
3027+
TensorOps.push_back(Group3); // D# group 3
3028+
}
3029+
3030+
// TODO: Handle the fifth group: N->getOperand(6), which is silently ignored
3031+
// for now because all existing targets only support up to 4 groups.
3032+
TensorOps.push_back(CurDAG->getTargetConstant(0, SDLoc(N), MVT::i1)); // r128
3033+
TensorOps.push_back(N->getOperand(7)); // cache policy
3034+
TensorOps.push_back(N->getOperand(0)); // chain
3035+
3036+
(void)CurDAG->SelectNodeTo(N, Opc, MVT::Other, TensorOps);
3037+
}
3038+
30083039
static unsigned gwsIntrinToOpcode(unsigned IntrID) {
30093040
switch (IntrID) {
30103041
case Intrinsic::amdgcn_ds_gws_init:
@@ -3287,6 +3318,10 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_VOID(SDNode *N) {
32873318
case Intrinsic::amdgcn_ds_gws_sema_release_all:
32883319
SelectDS_GWS(N, IntrID);
32893320
return;
3321+
case Intrinsic::amdgcn_tensor_load_to_lds:
3322+
case Intrinsic::amdgcn_tensor_store_from_lds:
3323+
SelectTensorLoadStore(N, IntrID);
3324+
return;
32903325
default:
32913326
break;
32923327
}

llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -285,6 +285,7 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
285285
void SelectFP_EXTEND(SDNode *N);
286286
void SelectDSAppendConsume(SDNode *N, unsigned IntrID);
287287
void SelectDSBvhStackIntrinsic(SDNode *N, unsigned IntrID);
288+
void SelectTensorLoadStore(SDNode *N, unsigned IntrID);
288289
void SelectDS_GWS(SDNode *N, unsigned IntrID);
289290
void SelectInterpP1F16(SDNode *N);
290291
void SelectINTRINSIC_W_CHAIN(SDNode *N);

llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp

Lines changed: 0 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1826,27 +1826,6 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
18261826
NewII->takeName(&II);
18271827
return IC.replaceInstUsesWith(II, NewII);
18281828
}
1829-
case Intrinsic::amdgcn_tensor_load_to_lds:
1830-
case Intrinsic::amdgcn_tensor_store_from_lds: {
1831-
Value *D2 = II.getArgOperand(2);
1832-
Value *D3 = II.getArgOperand(3);
1833-
// We know that not passing the second and third tensor DMA groups is
1834-
// equivalent to passing zeroes for those registers, so we rewrite to the
1835-
// shorter form here. Undef or poison are replaced by 0.
1836-
auto Pred = m_CombineOr(m_Zero(), m_Undef());
1837-
if (!match(D2, Pred) || !match(D3, Pred))
1838-
return std::nullopt;
1839-
1840-
auto ShortIntrinsic = IID == Intrinsic::amdgcn_tensor_load_to_lds
1841-
? Intrinsic::amdgcn_tensor_load_to_lds_d2
1842-
: Intrinsic::amdgcn_tensor_store_from_lds_d2;
1843-
CallInst *NewII = IC.Builder.CreateIntrinsic(
1844-
ShortIntrinsic,
1845-
{II.getArgOperand(0), II.getArgOperand(1), II.getArgOperand(4)});
1846-
NewII->takeName(&II);
1847-
NewII->copyMetadata(II);
1848-
return IC.eraseInstFromFunction(II);
1849-
}
18501829
case Intrinsic::amdgcn_wave_shuffle: {
18511830
if (!ST->hasDPP())
18521831
return std::nullopt;

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2393,6 +2393,9 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
23932393
case Intrinsic::amdgcn_global_load_lds:
23942394
case Intrinsic::amdgcn_global_load_async_lds:
23952395
return selectGlobalLoadLds(I);
2396+
case Intrinsic::amdgcn_tensor_load_to_lds:
2397+
case Intrinsic::amdgcn_tensor_store_from_lds:
2398+
return selectTensorLoadStore(I, IntrinsicID);
23962399
case Intrinsic::amdgcn_asyncmark:
23972400
case Intrinsic::amdgcn_wait_asyncmark:
23982401
// FIXME: Not supported on GFX12 yet. Will need a new feature when we do.
@@ -3787,6 +3790,47 @@ bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const{
37873790
return true;
37883791
}
37893792

3793+
bool AMDGPUInstructionSelector::selectTensorLoadStore(MachineInstr &MI,
3794+
Intrinsic::ID IID) const {
3795+
bool IsLoad = IID == Intrinsic::amdgcn_tensor_load_to_lds;
3796+
unsigned Opc =
3797+
IsLoad ? AMDGPU::TENSOR_LOAD_TO_LDS : AMDGPU::TENSOR_STORE_FROM_LDS;
3798+
int NumGroups = 4;
3799+
3800+
// A lambda function to check whether an operand is a vector of all 0s.
3801+
const auto isAllZeros = [&](MachineOperand &Opnd) {
3802+
const MachineInstr *DefMI = MRI->getVRegDef(Opnd.getReg());
3803+
if (!DefMI)
3804+
return false;
3805+
return llvm::isBuildVectorAllZeros(*DefMI, *MRI, true);
3806+
};
3807+
3808+
// Use _D2 version if both group 2 and 3 are zero-initialized.
3809+
if (isAllZeros(MI.getOperand(3)) && isAllZeros(MI.getOperand(4))) {
3810+
NumGroups = 2;
3811+
Opc = IsLoad ? AMDGPU::TENSOR_LOAD_TO_LDS_D2
3812+
: AMDGPU::TENSOR_STORE_FROM_LDS_D2;
3813+
}
3814+
3815+
// TODO: Handle the fifth group: MI.getOperand(5), which is silently ignored
3816+
// for now because all existing targets only support up to 4 groups.
3817+
MachineBasicBlock *MBB = MI.getParent();
3818+
auto MIB = BuildMI(*MBB, &MI, MI.getDebugLoc(), TII.get(Opc))
3819+
.add(MI.getOperand(1)) // D# group 0
3820+
.add(MI.getOperand(2)); // D# group 1
3821+
3822+
if (NumGroups >= 4) { // Has at least 4 groups
3823+
MIB.add(MI.getOperand(3)) // D# group 2
3824+
.add(MI.getOperand(4)); // D# group 3
3825+
}
3826+
3827+
MIB.addImm(0) // r128
3828+
.add(MI.getOperand(6)); // cpol
3829+
3830+
MI.eraseFromParent();
3831+
return true;
3832+
}
3833+
37903834
bool AMDGPUInstructionSelector::selectBVHIntersectRayIntrinsic(
37913835
MachineInstr &MI) const {
37923836
unsigned OpcodeOpIdx =

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -145,6 +145,7 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
145145
bool selectG_INSERT_VECTOR_ELT(MachineInstr &I) const;
146146
bool selectBufferLoadLds(MachineInstr &MI) const;
147147
bool selectGlobalLoadLds(MachineInstr &MI) const;
148+
bool selectTensorLoadStore(MachineInstr &MI, Intrinsic::ID IID) const;
148149
bool selectBVHIntersectRayIntrinsic(MachineInstr &I) const;
149150
bool selectSMFMACIntrin(MachineInstr &I) const;
150151
bool selectPermlaneSwapIntrin(MachineInstr &I, Intrinsic::ID IntrID) const;

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 1 addition & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3388,12 +3388,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
33883388
constrainOpWithReadfirstlane(B, MI, 2);
33893389
constrainOpWithReadfirstlane(B, MI, 3);
33903390
constrainOpWithReadfirstlane(B, MI, 4);
3391-
return;
3392-
}
3393-
case Intrinsic::amdgcn_tensor_load_to_lds_d2:
3394-
case Intrinsic::amdgcn_tensor_store_from_lds_d2: {
3395-
constrainOpWithReadfirstlane(B, MI, 1);
3396-
constrainOpWithReadfirstlane(B, MI, 2);
3391+
constrainOpWithReadfirstlane(B, MI, 5);
33973392
return;
33983393
}
33993394
default: {
@@ -5636,8 +5631,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
56365631
}
56375632
case Intrinsic::amdgcn_pops_exiting_wave_id:
56385633
return getDefaultMappingSOP(MI);
5639-
case Intrinsic::amdgcn_tensor_load_to_lds_d2:
5640-
case Intrinsic::amdgcn_tensor_store_from_lds_d2:
56415634
case Intrinsic::amdgcn_tensor_load_to_lds:
56425635
case Intrinsic::amdgcn_tensor_store_from_lds: {
56435636
// Lie and claim everything is legal, even all operands need to be

0 commit comments

Comments
 (0)