-
Notifications
You must be signed in to change notification settings - Fork 13.3k
[AMDGPU] Generalize global.load.lds to buffer fat pointers #134911
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
Direct load to LDS can also be implemented on buffer fat pointers, using the pointer as the offset to raw.ptr.buffer.load.lds. This commit generalizes the existing intrinsic to support this usage.
@llvm/pr-subscribers-mlir @llvm/pr-subscribers-llvm-ir Author: Krzysztof Drewniak (krzysz00) Changes: Direct load to LDS can also be implemented on buffer fat pointers, using the pointer as the offset to raw.ptr.buffer.load.lds. This commit generalizes the existing intrinsic to support this usage. Full diff: https://github.com/llvm/llvm-project/pull/134911.diff 3 Files Affected:
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 217e43fcce4fd..fc6dac5dc99fc 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2624,17 +2624,20 @@ def int_amdgcn_perm :
// GFX9 Intrinsics
//===----------------------------------------------------------------------===//
+// Intrinsic for loading data from a global-memory pointer to LDS
+// Also supports buffer fat pointers.
class AMDGPUGlobalLoadLDS :
ClangBuiltin<"__builtin_amdgcn_global_load_lds">,
Intrinsic <
[],
- [LLVMQualPointerType<1>, // Base global pointer to load from
- LLVMQualPointerType<3>, // LDS base pointer to store to
+ [llvm_anyptr_ty, // Global or buffer fat pointer to load from (per-lane)
+ LLVMQualPointerType<3>, // LDS base pointer to store to (uniform)
llvm_i32_ty, // Data byte size: 1/2/4 (/12/16 for gfx950)
llvm_i32_ty, // imm offset (applied to both global and LDS address)
llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = sc0,
// bit 1 = sc1,
// bit 4 = scc))
+ // See raw_ptr_buffer_load_lds for semantics on ptr addrspace(7)
[IntrWillReturn, NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>,
ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree],
"", [SDNPMemOperand]>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
index 766a4ea250942..f8b3c122d75ab 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
@@ -2167,6 +2167,7 @@ static bool isRemovablePointerIntrinsic(Intrinsic::ID IID) {
case Intrinsic::memset:
case Intrinsic::memset_inline:
case Intrinsic::experimental_memset_pattern:
+ case Intrinsic::amdgcn_global_load_lds:
return true;
}
}
@@ -2255,6 +2256,25 @@ PtrParts SplitPtrStructs::visitIntrinsicInst(IntrinsicInst &I) {
SplitUsers.insert(&I);
return {NewRsrc, Off};
}
+ case Intrinsic::amdgcn_global_load_lds: {
+ Value *Ptr = I.getArgOperand(0);
+ if (!isSplitFatPtr(Ptr->getType()))
+ return {nullptr, nullptr};
+ IRB.SetInsertPoint(&I);
+ auto [Rsrc, Off] = getPtrParts(Ptr);
+ Value *LDSPtr = I.getArgOperand(1);
+ Value *LoadSize = I.getArgOperand(2);
+ Value *ImmOff = I.getArgOperand(3);
+ Value *Aux = I.getArgOperand(4);
+ Value *SOffset = IRB.getInt32(0);
+ Instruction *NewLoad = IRB.CreateIntrinsic(
+ Intrinsic::amdgcn_raw_ptr_buffer_load_lds, {},
+ {Rsrc, LDSPtr, LoadSize, Off, SOffset, ImmOff, Aux});
+ copyMetadata(NewLoad, &I);
+ SplitUsers.insert(&I);
+ I.replaceAllUsesWith(NewLoad);
+ return {nullptr, nullptr};
+ }
}
return {nullptr, nullptr};
}
@@ -2291,7 +2311,10 @@ class AMDGPULowerBufferFatPointers : public ModulePass {
public:
static char ID;
- AMDGPULowerBufferFatPointers() : ModulePass(ID) {}
+ AMDGPULowerBufferFatPointers() : ModulePass(ID) {
+ initializeAMDGPULowerBufferFatPointersPass(
+ *PassRegistry::getPassRegistry());
+ }
bool run(Module &M, const TargetMachine &TM);
bool runOnModule(Module &M) override;
diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-mem-transfer.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-mem-transfer.ll
index ee51b0b84554e..75175955b313f 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-mem-transfer.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-mem-transfer.ll
@@ -1724,3 +1724,21 @@ define void @memset_pattern_unknown(ptr addrspace(7) inreg %ptr, i32 inreg %leng
call void @llvm.experimental.memset.pattern.p7.i32.i32(ptr addrspace(7) %ptr, i32 1, i32 %length, i1 false)
ret void
}
+
+;;; Buffer load to LDS
+
+declare void @llvm.amdgcn.global.load.lds.p7(ptr addrspace(7), ptr addrspace(3), i32 immarg, i32 immarg, i32 immarg)
+
+define void @llvm_amdgcn_global_load_lds(ptr addrspace(7) inreg %p, ptr addrspace(3) inreg %l, i32 %idx) {;
+; CHECK-LABEL: define void @llvm_amdgcn_global_load_lds(
+; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[P:%.*]], ptr addrspace(3) inreg [[L:%.*]], i32 [[IDX:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[P_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[P]], 0
+; CHECK-NEXT: [[P_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[P]], 1
+; CHECK-NEXT: [[Q:%.*]] = add i32 [[P_OFF]], [[IDX]]
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) [[P_RSRC]], ptr addrspace(3) [[L]], i32 4, i32 [[Q]], i32 0, i32 16, i32 0)
+; CHECK-NEXT: ret void
+;
+ %q = getelementptr i8, ptr addrspace(7) %p, i32 %idx
+ call void @llvm.amdgcn.global.load.lds(ptr addrspace(7) %q, ptr addrspace(3) %l, i32 4, i32 16, i32 0)
+ ret void
+}
|
@llvm/pr-subscribers-backend-amdgpu Author: Krzysztof Drewniak (krzysz00) Changes: Direct load to LDS can also be implemented on buffer fat pointers, using the pointer as the offset to raw.ptr.buffer.load.lds. This commit generalizes the existing intrinsic to support this usage. Full diff: https://github.com/llvm/llvm-project/pull/134911.diff 3 Files Affected:
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 217e43fcce4fd..fc6dac5dc99fc 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2624,17 +2624,20 @@ def int_amdgcn_perm :
// GFX9 Intrinsics
//===----------------------------------------------------------------------===//
+// Intrinsic for loading data from a global-memory pointer to LDS
+// Also supports buffer fat pointers.
class AMDGPUGlobalLoadLDS :
ClangBuiltin<"__builtin_amdgcn_global_load_lds">,
Intrinsic <
[],
- [LLVMQualPointerType<1>, // Base global pointer to load from
- LLVMQualPointerType<3>, // LDS base pointer to store to
+ [llvm_anyptr_ty, // Global or buffer fat pointer to load from (per-lane)
+ LLVMQualPointerType<3>, // LDS base pointer to store to (uniform)
llvm_i32_ty, // Data byte size: 1/2/4 (/12/16 for gfx950)
llvm_i32_ty, // imm offset (applied to both global and LDS address)
llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = sc0,
// bit 1 = sc1,
// bit 4 = scc))
+ // See raw_ptr_buffer_load_lds for semantics on ptr addrspace(7)
[IntrWillReturn, NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>,
ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree],
"", [SDNPMemOperand]>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
index 766a4ea250942..f8b3c122d75ab 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
@@ -2167,6 +2167,7 @@ static bool isRemovablePointerIntrinsic(Intrinsic::ID IID) {
case Intrinsic::memset:
case Intrinsic::memset_inline:
case Intrinsic::experimental_memset_pattern:
+ case Intrinsic::amdgcn_global_load_lds:
return true;
}
}
@@ -2255,6 +2256,25 @@ PtrParts SplitPtrStructs::visitIntrinsicInst(IntrinsicInst &I) {
SplitUsers.insert(&I);
return {NewRsrc, Off};
}
+ case Intrinsic::amdgcn_global_load_lds: {
+ Value *Ptr = I.getArgOperand(0);
+ if (!isSplitFatPtr(Ptr->getType()))
+ return {nullptr, nullptr};
+ IRB.SetInsertPoint(&I);
+ auto [Rsrc, Off] = getPtrParts(Ptr);
+ Value *LDSPtr = I.getArgOperand(1);
+ Value *LoadSize = I.getArgOperand(2);
+ Value *ImmOff = I.getArgOperand(3);
+ Value *Aux = I.getArgOperand(4);
+ Value *SOffset = IRB.getInt32(0);
+ Instruction *NewLoad = IRB.CreateIntrinsic(
+ Intrinsic::amdgcn_raw_ptr_buffer_load_lds, {},
+ {Rsrc, LDSPtr, LoadSize, Off, SOffset, ImmOff, Aux});
+ copyMetadata(NewLoad, &I);
+ SplitUsers.insert(&I);
+ I.replaceAllUsesWith(NewLoad);
+ return {nullptr, nullptr};
+ }
}
return {nullptr, nullptr};
}
@@ -2291,7 +2311,10 @@ class AMDGPULowerBufferFatPointers : public ModulePass {
public:
static char ID;
- AMDGPULowerBufferFatPointers() : ModulePass(ID) {}
+ AMDGPULowerBufferFatPointers() : ModulePass(ID) {
+ initializeAMDGPULowerBufferFatPointersPass(
+ *PassRegistry::getPassRegistry());
+ }
bool run(Module &M, const TargetMachine &TM);
bool runOnModule(Module &M) override;
diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-mem-transfer.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-mem-transfer.ll
index ee51b0b84554e..75175955b313f 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-mem-transfer.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-mem-transfer.ll
@@ -1724,3 +1724,21 @@ define void @memset_pattern_unknown(ptr addrspace(7) inreg %ptr, i32 inreg %leng
call void @llvm.experimental.memset.pattern.p7.i32.i32(ptr addrspace(7) %ptr, i32 1, i32 %length, i1 false)
ret void
}
+
+;;; Buffer load to LDS
+
+declare void @llvm.amdgcn.global.load.lds.p7(ptr addrspace(7), ptr addrspace(3), i32 immarg, i32 immarg, i32 immarg)
+
+define void @llvm_amdgcn_global_load_lds(ptr addrspace(7) inreg %p, ptr addrspace(3) inreg %l, i32 %idx) {;
+; CHECK-LABEL: define void @llvm_amdgcn_global_load_lds(
+; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[P:%.*]], ptr addrspace(3) inreg [[L:%.*]], i32 [[IDX:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[P_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[P]], 0
+; CHECK-NEXT: [[P_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[P]], 1
+; CHECK-NEXT: [[Q:%.*]] = add i32 [[P_OFF]], [[IDX]]
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) [[P_RSRC]], ptr addrspace(3) [[L]], i32 4, i32 [[Q]], i32 0, i32 16, i32 0)
+; CHECK-NEXT: ret void
+;
+ %q = getelementptr i8, ptr addrspace(7) %p, i32 %idx
+ call void @llvm.amdgcn.global.load.lds(ptr addrspace(7) %q, ptr addrspace(3) %l, i32 4, i32 16, i32 0)
+ ret void
+}
|
Also fix MLIR to represent immargs properly
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't think that the global_* intrinsic should be forced into covering buffers. It breaks the 1:1 name mapping to the instruction; the global instruction is not a buffer instruction.
Ok, so
So ... what am I supposed to do here? Is the call for a |
Mostly this, but I also don't really understand why this doesn't fit into the existing raw_buffer_load_lds |
However
Please note that the buffer.ptr version takes a voffset argument, which is a part of the buffer fat pointer. You have to lower the fat pointer LDS load to extract the voffset argument. |
(I'll take name suggestions on the ptr addrspace(7) intrinsic) |
That is to say, because the offset you're gathering from is on the buffer fat pointer, the addrspace(7) version of this load has the same function signature as |
Direct load to LDS can also be implemented on buffer fat pointers, using the pointer as the offset to raw.ptr.buffer.load.lds. This commit generalizes the existing intrinsic to support this usage.
This also updates MLIR and Clang to account for the newly-overloaded type, and fixes the MLIR definition of the intrinsic to use attributes for immargs, since I'm here anyway.