Skip to content

Commit 99dc561

Browse files
authored
[AMDGPU] Use a general form of intrinsic for tensor load/store (#182334)
The intrinsic has five arguments for the tensor descriptor (D#), while the fifth one is reserved for future targets, and it will be silently ignored in codegen for gfx1250. For tensors up to 2D, only the first two D# groups are meaningful and the rest should be zero-initialized.
1 parent 689ecf8 commit 99dc561

File tree

19 files changed

+211
-389
lines changed

19 files changed

+211
-389
lines changed

clang/include/clang/Basic/BuiltinsAMDGPU.td

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -752,10 +752,9 @@ def __builtin_amdgcn_global_store_async_from_lds_b128 : AMDGPUBuiltin<"void(_Ext
752752
def __builtin_amdgcn_ds_atomic_async_barrier_arrive_b64 : AMDGPUBuiltin<"void(long int address_space<3> *)", [Const], "gfx1250-insts">;
753753
def __builtin_amdgcn_ds_atomic_barrier_arrive_rtn_b64 : AMDGPUBuiltin<"long int(long int address_space<3> *, long int)", [Const], "gfx1250-insts">;
754754

755-
def __builtin_amdgcn_tensor_load_to_lds : AMDGPUBuiltin<"void(_ExtVector<4, int>, _ExtVector<8, int>, _ExtVector<4, int>, _ExtVector<4, int>, _Constant int)", [Const], "gfx1250-insts">;
756-
def __builtin_amdgcn_tensor_load_to_lds_d2 : AMDGPUBuiltin<"void(_ExtVector<4, int>, _ExtVector<8, int>, _Constant int)", [Const], "gfx1250-insts">;
757-
def __builtin_amdgcn_tensor_store_from_lds : AMDGPUBuiltin<"void(_ExtVector<4, int>, _ExtVector<8, int>, _ExtVector<4, int>, _ExtVector<4, int>, _Constant int)", [Const], "gfx1250-insts">;
758-
def __builtin_amdgcn_tensor_store_from_lds_d2 : AMDGPUBuiltin<"void(_ExtVector<4, int>, _ExtVector<8, int>, _Constant int)", [Const], "gfx1250-insts">;
755+
def __builtin_amdgcn_tensor_load_to_lds : AMDGPUBuiltin<"void(_ExtVector<4, int>, _ExtVector<8, int>, _ExtVector<4, int>, _ExtVector<4, int>, _ExtVector<8, int>, _Constant int)", [Const], "gfx1250-insts">;
756+
def __builtin_amdgcn_tensor_store_from_lds : AMDGPUBuiltin<"void(_ExtVector<4, int>, _ExtVector<8, int>, _ExtVector<4, int>, _ExtVector<4, int>, _ExtVector<8, int>, _Constant int)", [Const], "gfx1250-insts">;
757+
759758

760759
def __builtin_amdgcn_global_load_tr4_b64_v2i32 : AMDGPUBuiltin<"_ExtVector<2, int>(_ExtVector<2, int> address_space<1> *)", [Const], "transpose-load-f4f6-insts,wavefrontsize32">;
761760
def __builtin_amdgcn_global_load_tr8_b64_v2i32 : AMDGPUBuiltin<"_ExtVector<2, int>(_ExtVector<2, int> address_space<1> *)", [Const], "gfx1250-insts,wavefrontsize32">;

clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-tensor-load-store.cl

Lines changed: 38 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5,42 +5,72 @@
55
typedef int v4i __attribute__((ext_vector_type(4)));
66
typedef int v8i __attribute__((ext_vector_type(8)));
77

8+
static v4i v4i_zeros = (v4i){0,0,0,0};
9+
static v8i v8i_zeros = (v8i){0,0,0,0,0,0,0,0};
10+
811
// CHECK-GFX1250-LABEL: @test_amdgcn_tensor_load_to_lds(
912
// CHECK-GFX1250-NEXT: entry:
10-
// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.tensor.load.to.lds(<4 x i32> [[SG0:%.*]], <8 x i32> [[SG1:%.*]], <4 x i32> [[SG2:%.*]], <4 x i32> [[SG3:%.*]], i32 0)
13+
// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.tensor.load.to.lds(<4 x i32> [[SG0:%.*]], <8 x i32> [[SG1:%.*]], <4 x i32> [[SG2:%.*]], <4 x i32> [[SG3:%.*]], <8 x i32> zeroinitializer, i32 0)
1114
// CHECK-GFX1250-NEXT: ret void
1215
//
1316
void test_amdgcn_tensor_load_to_lds(v4i sg0, v8i sg1, v4i sg2, v4i sg3)
1417
{
15-
__builtin_amdgcn_tensor_load_to_lds(sg0, sg1, sg2, sg3, 0);
18+
__builtin_amdgcn_tensor_load_to_lds(sg0, sg1, sg2, sg3, v8i_zeros, 0);
1619
}
1720

1821
// CHECK-GFX1250-LABEL: @test_amdgcn_tensor_load_to_lds_d2(
1922
// CHECK-GFX1250-NEXT: entry:
20-
// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.tensor.load.to.lds.d2(<4 x i32> [[SG0:%.*]], <8 x i32> [[SG1:%.*]], i32 27)
23+
// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.tensor.load.to.lds(<4 x i32> [[SG0:%.*]], <8 x i32> [[SG1:%.*]], <4 x i32> zeroinitializer, <4 x i32> zeroinitializer, <8 x i32> zeroinitializer, i32 27)
2124
// CHECK-GFX1250-NEXT: ret void
2225
//
2326
void test_amdgcn_tensor_load_to_lds_d2(v4i sg0, v8i sg1)
2427
{
25-
__builtin_amdgcn_tensor_load_to_lds_d2(sg0, sg1, 27);
28+
__builtin_amdgcn_tensor_load_to_lds(sg0, sg1, v4i_zeros, v4i_zeros, v8i_zeros, 27);
2629
}
2730

2831
// CHECK-GFX1250-LABEL: @test_amdgcn_tensor_store_from_lds(
2932
// CHECK-GFX1250-NEXT: entry:
30-
// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.tensor.store.from.lds(<4 x i32> [[SG0:%.*]], <8 x i32> [[SG1:%.*]], <4 x i32> [[SG2:%.*]], <4 x i32> [[SG3:%.*]], i32 22)
33+
// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.tensor.store.from.lds(<4 x i32> [[SG0:%.*]], <8 x i32> [[SG1:%.*]], <4 x i32> [[SG2:%.*]], <4 x i32> [[SG3:%.*]], <8 x i32> zeroinitializer, i32 22)
3134
// CHECK-GFX1250-NEXT: ret void
3235
//
3336
void test_amdgcn_tensor_store_from_lds(v4i sg0, v8i sg1, v4i sg2, v4i sg3)
3437
{
35-
__builtin_amdgcn_tensor_store_from_lds(sg0, sg1, sg2, sg3, 22);
38+
__builtin_amdgcn_tensor_store_from_lds(sg0, sg1, sg2, sg3, v8i_zeros, 22);
3639
}
3740

3841
// CHECK-GFX1250-LABEL: @test_amdgcn_tensor_store_from_lds_d2(
3942
// CHECK-GFX1250-NEXT: entry:
40-
// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.tensor.store.from.lds.d2(<4 x i32> [[SG0:%.*]], <8 x i32> [[SG1:%.*]], i32 0)
43+
// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.tensor.store.from.lds(<4 x i32> [[SG0:%.*]], <8 x i32> [[SG1:%.*]], <4 x i32> zeroinitializer, <4 x i32> zeroinitializer, <8 x i32> zeroinitializer, i32 0)
4144
// CHECK-GFX1250-NEXT: ret void
4245
//
4346
void test_amdgcn_tensor_store_from_lds_d2(v4i sg0, v8i sg1)
4447
{
45-
__builtin_amdgcn_tensor_store_from_lds_d2(sg0, sg1, 0);
48+
__builtin_amdgcn_tensor_store_from_lds(sg0, sg1, v4i_zeros, v4i_zeros, v8i_zeros, 0);
49+
}
50+
51+
//=======================================================================
52+
// It is fine to pass 5 arguments as the tensor descriptor, but the fifth one
53+
// will be ignored by llvm CodeGen for gfx1250, which only supports D# up
54+
// to 4 groups.
55+
//========================================================================
56+
57+
58+
// CHECK-GFX1250-LABEL: @test_amdgcn_tensor_load_to_lds_d5(
59+
// CHECK-GFX1250-NEXT: entry:
60+
// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.tensor.load.to.lds(<4 x i32> [[SG0:%.*]], <8 x i32> [[SG1:%.*]], <4 x i32> [[SG2:%.*]], <4 x i32> [[SG3:%.*]], <8 x i32> [[SG4:%.*]], i32 0)
61+
// CHECK-GFX1250-NEXT: ret void
62+
//
63+
void test_amdgcn_tensor_load_to_lds_d5(v4i sg0, v8i sg1, v4i sg2, v4i sg3, v8i sg4)
64+
{
65+
__builtin_amdgcn_tensor_load_to_lds(sg0, sg1, sg2, sg3, sg4, 0);
66+
}
67+
68+
// CHECK-GFX1250-LABEL: @test_amdgcn_tensor_store_from_lds_d5(
69+
// CHECK-GFX1250-NEXT: entry:
70+
// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.tensor.store.from.lds(<4 x i32> [[SG0:%.*]], <8 x i32> [[SG1:%.*]], <4 x i32> [[SG2:%.*]], <4 x i32> [[SG3:%.*]], <8 x i32> [[SG4:%.*]], i32 0)
71+
// CHECK-GFX1250-NEXT: ret void
72+
//
73+
void test_amdgcn_tensor_store_from_lds_d5(v4i sg0, v8i sg1, v4i sg2, v4i sg3, v8i sg4)
74+
{
75+
__builtin_amdgcn_tensor_store_from_lds(sg0, sg1, sg2, sg3, sg4, 0);
4676
}

clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -183,12 +183,10 @@ void test_amdgcn_async_load_store_lds_cpol(global char* gaddr8, global int *gadd
183183
__builtin_amdgcn_global_store_async_from_lds_b128(gaddr128, laddr128, 16, cpol); // expected-error {{'__builtin_amdgcn_global_store_async_from_lds_b128' must be a constant integer}}
184184
}
185185

186-
void test_amdgcn_tensor_load_store(v4i sg0, v8i sg1, v4i sg2, v4i sg3, int cpol)
186+
void test_amdgcn_tensor_load_store(v4i sg0, v8i sg1, v4i sg2, v4i sg3, v8i sg4, int cpol)
187187
{
188-
__builtin_amdgcn_tensor_load_to_lds(sg0, sg1, sg2, sg3, cpol); // expected-error {{'__builtin_amdgcn_tensor_load_to_lds' must be a constant integer}}
189-
__builtin_amdgcn_tensor_load_to_lds_d2(sg0, sg1, cpol); // expected-error {{'__builtin_amdgcn_tensor_load_to_lds_d2' must be a constant integer}}
190-
__builtin_amdgcn_tensor_store_from_lds(sg0, sg1, sg2, sg3, cpol); // expected-error {{'__builtin_amdgcn_tensor_store_from_lds' must be a constant integer}}
191-
__builtin_amdgcn_tensor_store_from_lds_d2(sg0, sg1, cpol); // expected-error {{'__builtin_amdgcn_tensor_store_from_lds_d2' must be a constant integer}}
188+
__builtin_amdgcn_tensor_load_to_lds(sg0, sg1, sg2, sg3, sg4, cpol); // expected-error {{'__builtin_amdgcn_tensor_load_to_lds' must be a constant integer}}
189+
__builtin_amdgcn_tensor_store_from_lds(sg0, sg1, sg2, sg3, sg4, cpol); // expected-error {{'__builtin_amdgcn_tensor_store_from_lds' must be a constant integer}}
192190
}
193191

194192
void test_prefetch(generic void *fptr, global void *gptr, int cpol) {

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 7 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -4194,41 +4194,24 @@ def int_amdgcn_swmmac_f16_16x16x128_bf8_bf8 : AMDGPUSWmmacIntrinsicIdxReuse<llvm
41944194
def int_amdgcn_swmmac_i32_16x16x128_iu8 : AMDGPUSWmmacIntrinsicABIdxClamp<llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty>;
41954195
}
41964196

4197-
41984197
class AMDGPUTensorLoadStore:
41994198
Intrinsic<
42004199
[],
42014200
[llvm_v4i32_ty, // D# group 0
42024201
llvm_v8i32_ty, // D# group 1
4203-
llvm_v4i32_ty, // D# group 2
4204-
llvm_v4i32_ty, // D# group 3
4202+
llvm_v4i32_ty, // D# group 2: groups 2 and 3 should be zero-initialized for D# up to 2D.
4203+
llvm_v4i32_ty, // D# group 3:
4204+
llvm_v8i32_ty, // D# group 4: reserved for future targets, use <8 x i32> zeroinitializer for now.
4205+
// This argument will be silently ignored.
42054206
llvm_i32_ty], // cachepolicy:
42064207
// bits [0-2] = th
42074208
// bits [3-4] = scope
4208-
[IntrInaccessibleMemOrArgMemOnly, ImmArg<ArgIndex<4>>, IntrWillReturn, IntrConvergent, IntrNoCallback, IntrNoFree],
4209-
"", [SDNPMemOperand]
4210-
>;
4211-
4212-
class AMDGPUTensorLoadStoreD2:
4213-
Intrinsic<
4214-
[],
4215-
[llvm_v4i32_ty, // D# group 0
4216-
llvm_v8i32_ty, // D# group 1
4217-
llvm_i32_ty], // cachepolicy:
4218-
// bits [0-2] = th
4219-
// bits [3-4] = scope
4220-
[IntrInaccessibleMemOrArgMemOnly, ImmArg<ArgIndex<2>>, IntrWillReturn, IntrConvergent, IntrNoCallback, IntrNoFree],
4209+
[IntrInaccessibleMemOrArgMemOnly, ImmArg<ArgIndex<5>>, IntrWillReturn, IntrConvergent, IntrNoCallback, IntrNoFree],
42214210
"", [SDNPMemOperand]
42224211
>;
42234212

4224-
def int_amdgcn_tensor_load_to_lds :
4225-
ClangBuiltin<"__builtin_amdgcn_tensor_load_to_lds">, AMDGPUTensorLoadStore;
4226-
def int_amdgcn_tensor_store_from_lds :
4227-
ClangBuiltin<"__builtin_amdgcn_tensor_store_from_lds">, AMDGPUTensorLoadStore;
4228-
def int_amdgcn_tensor_load_to_lds_d2 :
4229-
ClangBuiltin<"__builtin_amdgcn_tensor_load_to_lds_d2">, AMDGPUTensorLoadStoreD2;
4230-
def int_amdgcn_tensor_store_from_lds_d2 :
4231-
ClangBuiltin<"__builtin_amdgcn_tensor_store_from_lds_d2">, AMDGPUTensorLoadStoreD2;
4213+
def int_amdgcn_tensor_load_to_lds : ClangBuiltin<"__builtin_amdgcn_tensor_load_to_lds">, AMDGPUTensorLoadStore;
4214+
def int_amdgcn_tensor_store_from_lds : ClangBuiltin<"__builtin_amdgcn_tensor_store_from_lds">, AMDGPUTensorLoadStore;
42324215

42334216
class AMDGPUClusterLoad<LLVMType ptr_ty>:
42344217
Intrinsic<

llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3005,6 +3005,37 @@ void AMDGPUDAGToDAGISel::SelectDSBvhStackIntrinsic(SDNode *N, unsigned IntrID) {
30053005
CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
30063006
}
30073007

3008+
void AMDGPUDAGToDAGISel::SelectTensorLoadStore(SDNode *N, unsigned IntrID) {
3009+
bool IsLoad = IntrID == Intrinsic::amdgcn_tensor_load_to_lds;
3010+
unsigned Opc =
3011+
IsLoad ? AMDGPU::TENSOR_LOAD_TO_LDS : AMDGPU::TENSOR_STORE_FROM_LDS;
3012+
3013+
SmallVector<SDValue, 7> TensorOps;
3014+
// First two groups
3015+
TensorOps.push_back(N->getOperand(2)); // D# group 0
3016+
TensorOps.push_back(N->getOperand(3)); // D# group 1
3017+
3018+
// Use _D2 version if both group 2 and 3 are zero-initialized.
3019+
SDValue Group2 = N->getOperand(4);
3020+
SDValue Group3 = N->getOperand(5);
3021+
if (ISD::isBuildVectorAllZeros(Group2.getNode()) &&
3022+
ISD::isBuildVectorAllZeros(Group3.getNode())) {
3023+
Opc = IsLoad ? AMDGPU::TENSOR_LOAD_TO_LDS_D2
3024+
: AMDGPU::TENSOR_STORE_FROM_LDS_D2;
3025+
} else { // Has at least 4 groups
3026+
TensorOps.push_back(Group2); // D# group 2
3027+
TensorOps.push_back(Group3); // D# group 3
3028+
}
3029+
3030+
// TODO: Handle the fifth group: N->getOperand(6), which is silently ignored
3031+
// for now because all existing targets only support up to 4 groups.
3032+
TensorOps.push_back(CurDAG->getTargetConstant(0, SDLoc(N), MVT::i1)); // r128
3033+
TensorOps.push_back(N->getOperand(7)); // cache policy
3034+
TensorOps.push_back(N->getOperand(0)); // chain
3035+
3036+
(void)CurDAG->SelectNodeTo(N, Opc, MVT::Other, TensorOps);
3037+
}
3038+
30083039
static unsigned gwsIntrinToOpcode(unsigned IntrID) {
30093040
switch (IntrID) {
30103041
case Intrinsic::amdgcn_ds_gws_init:
@@ -3287,6 +3318,10 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_VOID(SDNode *N) {
32873318
case Intrinsic::amdgcn_ds_gws_sema_release_all:
32883319
SelectDS_GWS(N, IntrID);
32893320
return;
3321+
case Intrinsic::amdgcn_tensor_load_to_lds:
3322+
case Intrinsic::amdgcn_tensor_store_from_lds:
3323+
SelectTensorLoadStore(N, IntrID);
3324+
return;
32903325
default:
32913326
break;
32923327
}

llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -285,6 +285,7 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
285285
void SelectFP_EXTEND(SDNode *N);
286286
void SelectDSAppendConsume(SDNode *N, unsigned IntrID);
287287
void SelectDSBvhStackIntrinsic(SDNode *N, unsigned IntrID);
288+
void SelectTensorLoadStore(SDNode *N, unsigned IntrID);
288289
void SelectDS_GWS(SDNode *N, unsigned IntrID);
289290
void SelectInterpP1F16(SDNode *N);
290291
void SelectINTRINSIC_W_CHAIN(SDNode *N);

llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp

Lines changed: 0 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1826,27 +1826,6 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
18261826
NewII->takeName(&II);
18271827
return IC.replaceInstUsesWith(II, NewII);
18281828
}
1829-
case Intrinsic::amdgcn_tensor_load_to_lds:
1830-
case Intrinsic::amdgcn_tensor_store_from_lds: {
1831-
Value *D2 = II.getArgOperand(2);
1832-
Value *D3 = II.getArgOperand(3);
1833-
// We know that not passing the second and third tensor DMA groups is
1834-
// equivalent to passing zeroes for those registers, so we rewrite to the
1835-
// shorter form here. Undef or poison are replaced by 0.
1836-
auto Pred = m_CombineOr(m_Zero(), m_Undef());
1837-
if (!match(D2, Pred) || !match(D3, Pred))
1838-
return std::nullopt;
1839-
1840-
auto ShortIntrinsic = IID == Intrinsic::amdgcn_tensor_load_to_lds
1841-
? Intrinsic::amdgcn_tensor_load_to_lds_d2
1842-
: Intrinsic::amdgcn_tensor_store_from_lds_d2;
1843-
CallInst *NewII = IC.Builder.CreateIntrinsic(
1844-
ShortIntrinsic,
1845-
{II.getArgOperand(0), II.getArgOperand(1), II.getArgOperand(4)});
1846-
NewII->takeName(&II);
1847-
NewII->copyMetadata(II);
1848-
return IC.eraseInstFromFunction(II);
1849-
}
18501829
case Intrinsic::amdgcn_wave_shuffle: {
18511830
if (!ST->hasDPP())
18521831
return std::nullopt;

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2393,6 +2393,9 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
23932393
case Intrinsic::amdgcn_global_load_lds:
23942394
case Intrinsic::amdgcn_global_load_async_lds:
23952395
return selectGlobalLoadLds(I);
2396+
case Intrinsic::amdgcn_tensor_load_to_lds:
2397+
case Intrinsic::amdgcn_tensor_store_from_lds:
2398+
return selectTensorLoadStore(I, IntrinsicID);
23962399
case Intrinsic::amdgcn_asyncmark:
23972400
case Intrinsic::amdgcn_wait_asyncmark:
23982401
// FIXME: Not supported on GFX12 yet. Will need a new feature when we do.
@@ -3787,6 +3790,47 @@ bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const{
37873790
return true;
37883791
}
37893792

3793+
bool AMDGPUInstructionSelector::selectTensorLoadStore(MachineInstr &MI,
3794+
Intrinsic::ID IID) const {
3795+
bool IsLoad = IID == Intrinsic::amdgcn_tensor_load_to_lds;
3796+
unsigned Opc =
3797+
IsLoad ? AMDGPU::TENSOR_LOAD_TO_LDS : AMDGPU::TENSOR_STORE_FROM_LDS;
3798+
int NumGroups = 4;
3799+
3800+
// A lambda function to check whether an operand is a vector of all 0s.
3801+
const auto isAllZeros = [&](MachineOperand &Opnd) {
3802+
const MachineInstr *DefMI = MRI->getVRegDef(Opnd.getReg());
3803+
if (!DefMI)
3804+
return false;
3805+
return llvm::isBuildVectorAllZeros(*DefMI, *MRI, true);
3806+
};
3807+
3808+
// Use _D2 version if both group 2 and 3 are zero-initialized.
3809+
if (isAllZeros(MI.getOperand(3)) && isAllZeros(MI.getOperand(4))) {
3810+
NumGroups = 2;
3811+
Opc = IsLoad ? AMDGPU::TENSOR_LOAD_TO_LDS_D2
3812+
: AMDGPU::TENSOR_STORE_FROM_LDS_D2;
3813+
}
3814+
3815+
// TODO: Handle the fifth group: MI.getOperand(5), which is silently ignored
3816+
// for now because all existing targets only support up to 4 groups.
3817+
MachineBasicBlock *MBB = MI.getParent();
3818+
auto MIB = BuildMI(*MBB, &MI, MI.getDebugLoc(), TII.get(Opc))
3819+
.add(MI.getOperand(1)) // D# group 0
3820+
.add(MI.getOperand(2)); // D# group 1
3821+
3822+
if (NumGroups >= 4) { // Has at least 4 groups
3823+
MIB.add(MI.getOperand(3)) // D# group 2
3824+
.add(MI.getOperand(4)); // D# group 3
3825+
}
3826+
3827+
MIB.addImm(0) // r128
3828+
.add(MI.getOperand(6)); // cpol
3829+
3830+
MI.eraseFromParent();
3831+
return true;
3832+
}
3833+
37903834
bool AMDGPUInstructionSelector::selectBVHIntersectRayIntrinsic(
37913835
MachineInstr &MI) const {
37923836
unsigned OpcodeOpIdx =

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -145,6 +145,7 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
145145
bool selectG_INSERT_VECTOR_ELT(MachineInstr &I) const;
146146
bool selectBufferLoadLds(MachineInstr &MI) const;
147147
bool selectGlobalLoadLds(MachineInstr &MI) const;
148+
bool selectTensorLoadStore(MachineInstr &MI, Intrinsic::ID IID) const;
148149
bool selectBVHIntersectRayIntrinsic(MachineInstr &I) const;
149150
bool selectSMFMACIntrin(MachineInstr &I) const;
150151
bool selectPermlaneSwapIntrin(MachineInstr &I, Intrinsic::ID IntrID) const;

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 1 addition & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3388,12 +3388,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
33883388
constrainOpWithReadfirstlane(B, MI, 2);
33893389
constrainOpWithReadfirstlane(B, MI, 3);
33903390
constrainOpWithReadfirstlane(B, MI, 4);
3391-
return;
3392-
}
3393-
case Intrinsic::amdgcn_tensor_load_to_lds_d2:
3394-
case Intrinsic::amdgcn_tensor_store_from_lds_d2: {
3395-
constrainOpWithReadfirstlane(B, MI, 1);
3396-
constrainOpWithReadfirstlane(B, MI, 2);
3391+
constrainOpWithReadfirstlane(B, MI, 5);
33973392
return;
33983393
}
33993394
default: {
@@ -5636,8 +5631,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
56365631
}
56375632
case Intrinsic::amdgcn_pops_exiting_wave_id:
56385633
return getDefaultMappingSOP(MI);
5639-
case Intrinsic::amdgcn_tensor_load_to_lds_d2:
5640-
case Intrinsic::amdgcn_tensor_store_from_lds_d2:
56415634
case Intrinsic::amdgcn_tensor_load_to_lds:
56425635
case Intrinsic::amdgcn_tensor_store_from_lds: {
56435636
// Lie and claim everything is legal, even all operands need to be

0 commit comments

Comments
 (0)