[MLIR] Propagate known cluster sizes from gpu.launch to gpu.func #174404
@llvm/pr-subscribers-mlir-gpu @llvm/pr-subscribers-mlir

Author: Adam Paszke (apaszke)

Changes

This lets us properly annotate ranges for gpu.cluster_block_id and gpu.cluster_dim_blocks. It also allows us to fill in the nvvm.cluster_dim attribute for use in the NVVM backend.

Full diff: https://github.com/llvm/llvm-project/pull/174404.diff

12 Files Affected:
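Before the per-file changes, a minimal sketch of the IR this enables (illustrative only, not taken from the patch): a gpu.launch with a statically known cluster shape can be outlined into a gpu.func carrying known_cluster_size, which the lowerings below consume.

```mlir
// Illustrative example: a launch with a static 2x2x1 cluster shape.
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
gpu.launch clusters(%cx, %cy, %cz) in (%csz_x = %c2, %csz_y = %c2, %csz_z = %c1)
           blocks(%bx, %by, %bz) in (%bsz_x = %c2, %bsz_y = %c1, %bsz_z = %c1)
           threads(%tx, %ty, %tz) in (%tsz_x = %c1, %tsz_y = %c1, %tsz_z = %c1) {
  gpu.terminator
}

// After outlining, the kernel can carry the statically known cluster size:
gpu.func @kernel() kernel attributes {known_cluster_size = array<i32: 2, 2, 1>} {
  gpu.return
}
```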
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUBase.td b/mlir/include/mlir/Dialect/GPU/IR/GPUBase.td
index 2c29bb8a01a41..f0086158fb9b6 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUBase.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUBase.td
@@ -61,10 +61,10 @@ def GPU_Dialect : Dialect {
/// attribute with value `workgroup`.
static bool isWorkgroupMemoryAddressSpace(Attribute memorySpace);
}];
-
let discardableAttrs = (ins
"::mlir::DenseI32ArrayAttr":$known_block_size,
- "::mlir::DenseI32ArrayAttr":$known_grid_size
+ "::mlir::DenseI32ArrayAttr":$known_grid_size,
+ "::mlir::DenseI32ArrayAttr":$known_cluster_size
);
let dependentDialects = ["arith::ArithDialect"];
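Declaring known_cluster_size as a discardable dialect attribute makes ODS generate a KnownClusterSizeAttrHelper next to the existing block- and grid-size helpers. A minimal sketch of how such a helper is queried (mirroring the lowering code further down; op and funcOp are assumed to be in scope):

```cpp
// Look up the known cluster size on a function, if the attribute is set.
auto clusterHelper =
    gpu::GPUDialect::KnownClusterSizeAttrHelper(op.getContext());
if (clusterHelper.isAttrPresent(funcOp)) {
  DenseI32ArrayAttr clusterSize = clusterHelper.getAttr(funcOp);
  // clusterSize holds the x/y/z extents, e.g. array<i32: 8, 2, 4>.
}
```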
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
index 4884541a60535..e8c23200547d6 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -431,7 +431,8 @@ def GPU_GPUFuncOp : GPU_Op<"func", [
OptionalAttr<DictArrayAttr>:$workgroup_attrib_attrs,
OptionalAttr<DictArrayAttr>:$private_attrib_attrs,
GPU_OptionalDimSizeHintAttr:$known_block_size,
- GPU_OptionalDimSizeHintAttr:$known_grid_size);
+ GPU_OptionalDimSizeHintAttr:$known_grid_size,
+ GPU_OptionalDimSizeHintAttr:$known_cluster_size);
let regions = (region AnyRegion:$body);
let skipDefaultBuilders = 1;
diff --git a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp
index eb662a1b056de..52dbeea829594 100644
--- a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp
@@ -186,7 +186,8 @@ GPUFuncOpLowering::matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor,
attr.getName() == gpuFuncOp.getWorkgroupAttribAttrsAttrName() ||
attr.getName() == gpuFuncOp.getPrivateAttribAttrsAttrName() ||
attr.getName() == gpuFuncOp.getKnownBlockSizeAttrName() ||
- attr.getName() == gpuFuncOp.getKnownGridSizeAttrName())
+ attr.getName() == gpuFuncOp.getKnownGridSizeAttrName() ||
+ attr.getName() == gpuFuncOp.getKnownClusterSizeAttrName())
continue;
if (attr.getName() == gpuFuncOp.getArgAttrsAttrName()) {
argAttrs = gpuFuncOp.getArgAttrsAttr();
@@ -197,6 +198,7 @@ GPUFuncOpLowering::matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor,
DenseI32ArrayAttr knownBlockSize = gpuFuncOp.getKnownBlockSizeAttr();
DenseI32ArrayAttr knownGridSize = gpuFuncOp.getKnownGridSizeAttr();
+ DenseI32ArrayAttr knownClusterSize = gpuFuncOp.getKnownClusterSizeAttr();
// Ensure we don't lose information if the function is lowered before its
// surrounding context.
auto *gpuDialect = cast<gpu::GPUDialect>(gpuFuncOp->getDialect());
@@ -206,6 +208,9 @@ GPUFuncOpLowering::matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor,
if (knownGridSize)
attributes.emplace_back(gpuDialect->getKnownGridSizeAttrHelper().getName(),
knownGridSize);
+ if (knownClusterSize)
+ attributes.emplace_back(gpuDialect->getKnownClusterSizeAttrHelper().getName(),
+ knownClusterSize);
// Add a dialect specific kernel attribute in addition to GPU kernel
// attribute. The former is necessary for further translation while the
@@ -217,6 +222,10 @@ GPUFuncOpLowering::matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor,
if (kernelBlockSizeAttributeName && knownBlockSize) {
attributes.emplace_back(kernelBlockSizeAttributeName, knownBlockSize);
}
+ // Set the dialect-specific cluster size attribute if there is one.
+ if (kernelClusterSizeAttributeName && knownClusterSize) {
+ attributes.emplace_back(kernelClusterSizeAttributeName, knownClusterSize);
+ }
}
LLVM::CConv callingConvention = gpuFuncOp.isKernel()
? kernelCallingConvention
diff --git a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h
index ec74787b2a8ed..47094e91e4dcc 100644
--- a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h
+++ b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h
@@ -73,6 +73,10 @@ struct GPUFuncOpLoweringOptions {
/// The attribute name to set block size. Null if no attribute should be
/// used.
StringAttr kernelBlockSizeAttributeName;
+ /// The attribute name to set cluster size. Null if no attribute should be
+ /// used.
+ StringAttr kernelClusterSizeAttributeName;
+
/// The calling convention to use for kernel functions.
LLVM::CConv kernelCallingConvention = LLVM::CConv::C;
@@ -93,6 +97,7 @@ struct GPUFuncOpLowering : ConvertOpToLLVMPattern<gpu::GPUFuncOp> {
workgroupAddrSpace(options.workgroupAddrSpace),
kernelAttributeName(options.kernelAttributeName),
kernelBlockSizeAttributeName(options.kernelBlockSizeAttributeName),
+ kernelClusterSizeAttributeName(options.kernelClusterSizeAttributeName),
kernelCallingConvention(options.kernelCallingConvention),
nonKernelCallingConvention(options.nonKernelCallingConvention),
encodeWorkgroupAttributionsAsArguments(
@@ -114,6 +119,10 @@ struct GPUFuncOpLowering : ConvertOpToLLVMPattern<gpu::GPUFuncOp> {
/// The attribute name to set block size. Null if no attribute should be
/// used.
StringAttr kernelBlockSizeAttributeName;
+ /// The attribute name to set cluster size. Null if no attribute should be
+ /// used.
+ StringAttr kernelClusterSizeAttributeName;
+
/// The calling convention to use for kernel functions
LLVM::CConv kernelCallingConvention;
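Targets without a cluster concept leave the new field null, as the ROCDL and LLVM-SPV call sites below do. For a target that does forward it, a hypothetical call site might look like this (converter and patterns are assumed to be in scope; the address-space constants are illustrative):

```cpp
// Hypothetical NVVM-style call site forwarding known_cluster_size as
// nvvm.cluster_dim (sketch only; the in-tree NVVM path is shown below).
MLIRContext *ctx = &converter.getContext();
patterns.add<GPUFuncOpLowering>(
    converter,
    GPUFuncOpLoweringOptions{
        /*allocaAddrSpace=*/5, /*workgroupAddrSpace=*/3,
        StringAttr::get(ctx, NVVM::NVVMDialect::getKernelFuncAttrName()),
        /*kernelBlockSizeAttributeName=*/
        StringAttr::get(ctx, NVVM::NVVMDialect::getMaxntidAttrName()),
        /*kernelClusterSizeAttributeName=*/
        StringAttr::get(ctx, NVVM::NVVMDialect::getClusterDimAttrName())});
```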
diff --git a/mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.h b/mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.h
index 91c43e8bd1117..ae0239132e7d0 100644
--- a/mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.h
+++ b/mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.h
@@ -17,7 +17,7 @@
namespace mlir {
namespace gpu {
namespace index_lowering {
-enum class IndexKind : uint32_t { Other = 0, Block = 1, Grid = 2 };
+enum class IndexKind : uint32_t { Other = 0, Block = 1, Grid = 2, Cluster = 3 };
enum class IntrType : uint32_t {
None = 0,
Id = 1,
@@ -92,6 +92,13 @@ struct OpLowering : public ConvertOpToLLVMPattern<Op> {
funcBounds = gridHelper.getAttr(funcOp);
break;
}
+ case IndexKind::Cluster: {
+ auto clusterHelper =
+ gpu::GPUDialect::KnownClusterSizeAttrHelper(op.getContext());
+ if (clusterHelper.isAttrPresent(funcOp))
+ funcBounds = clusterHelper.getAttr(funcOp);
+ break;
+ }
case IndexKind::Other:
break;
}
@@ -104,6 +111,9 @@ struct OpLowering : public ConvertOpToLLVMPattern<Op> {
case IndexKind::Grid:
funcBounds = gpuFunc.getKnownGridSizeAttr();
break;
+ case IndexKind::Cluster:
+ funcBounds = gpuFunc.getKnownClusterSizeAttr();
+ break;
case IndexKind::Other:
break;
}
diff --git a/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp b/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp
index c0480a1dfb512..dd05c913347ee 100644
--- a/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp
+++ b/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp
@@ -533,6 +533,7 @@ void populateGpuToLLVMSPVConversionPatterns(
GPUFuncOpLoweringOptions{
privateAddressSpace, localAddressSpace,
/*kernelAttributeName=*/{}, kernelBlockSizeAttributeName,
+ /*kernelClusterSizeAttributeName=*/{},
LLVM::CConv::SPIR_KERNEL, LLVM::CConv::SPIR_FUNC,
/*encodeWorkgroupAttributionsAsArguments=*/true});
}
diff --git a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
index 2561ca00d4b4f..6394296e99b9e 100644
--- a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
+++ b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
@@ -708,11 +708,11 @@ void mlir::populateGpuToNVVMConversionPatterns(
patterns.add<gpu::index_lowering::OpLowering<
gpu::ClusterBlockIdOp, NVVM::BlockInClusterIdXOp,
NVVM::BlockInClusterIdYOp, NVVM::BlockInClusterIdZOp>>(
- converter, IndexKind::Other, IntrType::Id, benefit);
+ converter, IndexKind::Cluster, IntrType::Id, benefit);
patterns.add<gpu::index_lowering::OpLowering<
gpu::ClusterDimBlocksOp, NVVM::ClusterDimBlocksXOp,
NVVM::ClusterDimBlocksYOp, NVVM::ClusterDimBlocksZOp>>(
- converter, IndexKind::Other, IntrType::Dim, benefit);
+ converter, IndexKind::Cluster, IntrType::Dim, benefit);
patterns.add<gpu::index_lowering::OpLowering<
gpu::BlockIdOp, NVVM::BlockIdXOp, NVVM::BlockIdYOp, NVVM::BlockIdZOp>>(
converter, IndexKind::Grid, IntrType::Id, benefit);
@@ -737,7 +737,9 @@ void mlir::populateGpuToNVVMConversionPatterns(
StringAttr::get(&converter.getContext(),
NVVM::NVVMDialect::getKernelFuncAttrName()),
StringAttr::get(&converter.getContext(),
- NVVM::NVVMDialect::getMaxntidAttrName())},
+ NVVM::NVVMDialect::getMaxntidAttrName()),
+ StringAttr::get(&converter.getContext(),
+ NVVM::NVVMDialect::getClusterDimAttrName())},
benefit);
populateLibDeviceConversionPatterns(converter, patterns, benefit);
diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
index 51741414d2060..b8eb6d7facc6d 100644
--- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
+++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
@@ -458,7 +458,8 @@ void mlir::populateGpuToROCDLConversionPatterns(
/*allocaAddrSpace=*/ROCDL::ROCDLDialect::kPrivateMemoryAddressSpace,
/*workgroupAddrSpace=*/ROCDL::ROCDLDialect::kSharedMemoryAddressSpace,
rocdlDialect->getKernelAttrHelper().getName(),
- rocdlDialect->getReqdWorkGroupSizeAttrHelper().getName()});
+ rocdlDialect->getReqdWorkGroupSizeAttrHelper().getName(),
+ /*kernelClusterSizeAttributeName=*/{}});
if (Runtime::HIP == runtime) {
patterns.add<GPUPrintfOpToHIPLowering>(converter);
} else if (Runtime::OpenCL == runtime) {
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
index 21c0d369b8d1c..36db6e82baaea 100644
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -396,6 +396,8 @@ LogicalResult GPUDialect::verifyOperationAttribute(Operation *op,
return verifyKnownLaunchSizeAttr(op, attr);
if (attr.getName() == getKnownGridSizeAttrHelper().getName())
return verifyKnownLaunchSizeAttr(op, attr);
+ if (attr.getName() == getKnownClusterSizeAttrHelper().getName())
+ return verifyKnownLaunchSizeAttr(op, attr);
if (!llvm::isa<UnitAttr>(attr.getValue()) ||
attr.getName() != getContainerModuleAttrName())
return success();
diff --git a/mlir/lib/Dialect/GPU/IR/InferIntRangeInterfaceImpls.cpp b/mlir/lib/Dialect/GPU/IR/InferIntRangeInterfaceImpls.cpp
index bee3f392c91b5..263fcb96c17db 100644
--- a/mlir/lib/Dialect/GPU/IR/InferIntRangeInterfaceImpls.cpp
+++ b/mlir/lib/Dialect/GPU/IR/InferIntRangeInterfaceImpls.cpp
@@ -30,7 +30,7 @@ static ConstantIntRanges getIndexRange(uint64_t umin, uint64_t umax) {
}
namespace {
-enum class LaunchDims : uint32_t { Block = 0, Grid = 1 };
+enum class LaunchDims : uint32_t { Block = 0, Grid = 1, Cluster = 2 };
} // end namespace
/// If the operation `op` is in a context that is annotated with maximum
@@ -63,6 +63,9 @@ getKnownLaunchAttr(GPUFuncOp func, LaunchDims dims, Dimension dim) {
case LaunchDims::Grid:
bounds = func.getKnownGridSizeAttr();
break;
+ case LaunchDims::Cluster:
+ bounds = func.getKnownClusterSizeAttr();
+ break;
}
if (!bounds)
return std::nullopt;
@@ -94,6 +97,13 @@ static std::optional<uint64_t> getKnownLaunchDim(Op op, LaunchDims type) {
case LaunchDims::Grid:
bounds = launch.getGridSizeOperandValues();
break;
+ case LaunchDims::Cluster:
+ if (launch.hasClusterSize()) {
+ auto clusterBounds = launch.getClusterSizeOperandValues();
+ if (clusterBounds)
+ bounds = *clusterBounds;
+ }
+ break;
}
Value maybeBound = valueByDim(bounds, dim);
APInt value;
@@ -115,6 +125,9 @@ static std::optional<uint64_t> getKnownLaunchDim(Op op, LaunchDims type) {
case LaunchDims::Grid:
attrName = GPUDialect::KnownGridSizeAttrHelper::getNameStr();
break;
+ case LaunchDims::Cluster:
+ attrName = GPUDialect::KnownClusterSizeAttrHelper::getNameStr();
+ break;
}
auto discardableAttr = getKnownLaunchAttr(func, attrName, dim);
if (discardableAttr)
@@ -133,6 +146,9 @@ void ClusterDimOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
void ClusterDimBlocksOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
SetIntRangeFn setResultRange) {
+ if (auto known = getKnownLaunchDim(*this, LaunchDims::Cluster))
+ return setResultRange(getResult(), getIndexRange(*known, *known));
+
uint64_t max = kMaxClusterDim;
if (auto specified = getUpperBound())
max = specified->getZExtValue();
@@ -150,6 +166,8 @@ void ClusterIdOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
void ClusterBlockIdOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
SetIntRangeFn setResultRange) {
uint64_t max = kMaxClusterDim;
+ if (auto known = getKnownLaunchDim(*this, LaunchDims::Cluster))
+ max = *known;
if (auto specified = getUpperBound())
max = specified->getZExtValue();
setResultRange(getResult(), getIndexRange(0, max - 1ULL));
diff --git a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
index f1cc1eb983267..55ee508aa9f55 100644
--- a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
+++ b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
@@ -1149,3 +1149,45 @@ gpu.module @test_module_56 {
func.return %sin16, %cos16, %sin32, %cos32, %sin64, %cos64 : f16, f16, f32, f32, f64, f64
}
}
+
+// -----
+
+gpu.module @test_module_cluster_size {
+ // CHECK-LABEL: llvm.func @kernel_with_cluster_size()
+ // CHECK-SAME: nvvm.cluster_dim = array<i32: 8, 2, 4>
+ gpu.func @kernel_with_cluster_size() kernel attributes {known_cluster_size = array<i32: 8, 2, 4>} {
+ gpu.return
+ }
+}
+
+// -----
+
+gpu.module @test_module_cluster_block_ops {
+// CHECK-LABEL: llvm.func @kernel_with_cluster_size(
+// CHECK-SAME: %[[ARG0:.*]]: !llvm.ptr)
+// CHECK-SAME: gpu.known_cluster_size = array<i32: 8, 4, 2>
+ gpu.func @kernel_with_cluster_size(%arg0: !llvm.ptr) kernel attributes {known_cluster_size = array<i32: 8, 4, 2>} {
+ // CHECK: nvvm.read.ptx.sreg.cluster.ctaid.x range <i32, 0, 8> : i32
+ %0 = gpu.cluster_block_id x
+ // CHECK: nvvm.read.ptx.sreg.cluster.ctaid.y range <i32, 0, 4> : i32
+ %1 = gpu.cluster_block_id y
+ // CHECK: nvvm.read.ptx.sreg.cluster.ctaid.z range <i32, 0, 2> : i32
+ %2 = gpu.cluster_block_id z
+ // CHECK: nvvm.read.ptx.sreg.cluster.nctaid.x range <i32, 1, 9> : i32
+ %3 = gpu.cluster_dim_blocks x
+ // CHECK: nvvm.read.ptx.sreg.cluster.nctaid.y range <i32, 1, 5> : i32
+ %4 = gpu.cluster_dim_blocks y
+ // CHECK: nvvm.read.ptx.sreg.cluster.nctaid.z range <i32, 1, 3> : i32
+ %5 = gpu.cluster_dim_blocks z
+
+ %6 = arith.addi %0, %1 : index
+ %7 = arith.addi %6, %2 : index
+ %8 = arith.addi %7, %3 : index
+ %9 = arith.addi %8, %4 : index
+ %10 = arith.addi %9, %5 : index
+ %11 = arith.index_cast %10 : index to i64
+ llvm.store %11, %arg0 : i64, !llvm.ptr
+ gpu.return
+ }
+}
+
diff --git a/mlir/test/Dialect/GPU/int-range-interface-cluster.mlir b/mlir/test/Dialect/GPU/int-range-interface-cluster.mlir
new file mode 100644
index 0000000000000..a7dd0df2e2c13
--- /dev/null
+++ b/mlir/test/Dialect/GPU/int-range-interface-cluster.mlir
@@ -0,0 +1,27 @@
+// RUN: mlir-opt -int-range-optimizations %s | FileCheck %s
+
+gpu.module @test_module {
+ gpu.func @test_cluster_ranges() kernel attributes {known_cluster_size = array<i32: 8, 4, 1>} {
+ %c0 = gpu.cluster_block_id x
+ // CHECK: test.reflect_bounds {smax = 7 : index, smin = 0 : index, umax = 7 : index, umin = 0 : index}
+ %c0_0 = test.reflect_bounds %c0 : index
+ %c1 = gpu.cluster_block_id y
+ // CHECK: test.reflect_bounds {smax = 3 : index, smin = 0 : index, umax = 3 : index, umin = 0 : index}
+ %c1_0 = test.reflect_bounds %c1 : index
+ %c2 = gpu.cluster_block_id z
+ // CHECK: test.reflect_bounds {smax = 0 : index, smin = 0 : index, umax = 0 : index, umin = 0 : index}
+ %c2_0 = test.reflect_bounds %c2 : index
+
+ %d0 = gpu.cluster_dim_blocks x
+ // CHECK: test.reflect_bounds {smax = 8 : index, smin = 8 : index, umax = 8 : index, umin = 8 : index}
+ %d0_0 = test.reflect_bounds %d0 : index
+ %d1 = gpu.cluster_dim_blocks y
+ // CHECK: test.reflect_bounds {smax = 4 : index, smin = 4 : index, umax = 4 : index, umin = 4 : index}
+ %d1_0 = test.reflect_bounds %d1 : index
+ %d2 = gpu.cluster_dim_blocks z
+ // CHECK: test.reflect_bounds {smax = 1 : index, smin = 1 : index, umax = 1 : index, umin = 1 : index}
+ %d2_0 = test.reflect_bounds %d2 : index
+
+ gpu.return
+ }
+}
✅ With the latest revision this PR passed the C/C++ code formatter.