@@ -112,28 +112,42 @@ func.func @matmul_transpose_b(%5: tensor<64x64xf32>, %6: tensor<64x1280xf16>, %7

// -----

#config = #iree_gpu.lowering_config<{reduction = [0, 8]}>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0)>
func.func @reduction(%3: tensor<128x384xf32>) -> tensor<128xf32> {
#config = #iree_gpu.lowering_config<{reduction = [0, 8, 4]}>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map2 = affine_map<(d0, d1, d2) -> (d0)>
func.func @reduction(%arg0: tensor<128x384x256xf32>) -> tensor<128xf32> {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c3 = arith.constant 3 : index
%cst = arith.constant 0.000000e+00 : f32
%empty = tensor.empty() : tensor<128xf32>
%4 = linalg.fill ins(%cst : f32) outs(%empty : tensor<128xf32>) -> tensor<128xf32>
%5 = linalg.generic {
indexing_maps = [#map1, #map2],
iterator_types = ["parallel", "reduction"]
} ins(%3 : tensor<128x384xf32>) outs(%4 : tensor<128xf32>) attrs = {lowering_config = #config} {
^bb0(%in: f32, %out: f32):
%7 = arith.addf %in, %out : f32
linalg.yield %7 : f32
} -> tensor<128xf32>
return %5 : tensor<128xf32>
%init = linalg.fill ins(%cst : f32) outs(%empty : tensor<128xf32>) -> tensor<128xf32>

// Parent scf.for loop that will be coalesced with reduction tiling loops.
%result = scf.for %iv = %c0 to %c3 step %c1 iter_args(%arg1 = %init) -> (tensor<128xf32>) {
%slice = tensor.extract_slice %arg0[0, 0, 0] [128, 384, 256] [1, 1, 1] : tensor<128x384x256xf32> to tensor<128x384x256xf32>
%reduced = linalg.generic {
indexing_maps = [#map1, #map2],
iterator_types = ["parallel", "reduction", "reduction"]
} ins(%slice : tensor<128x384x256xf32>) outs(%arg1 : tensor<128xf32>) attrs = {lowering_config = #config} {
^bb0(%in: f32, %out: f32):
%add = arith.addf %in, %out : f32
linalg.yield %add : f32
} -> tensor<128xf32>
scf.yield %reduced : tensor<128xf32>
}
return %result : tensor<128xf32>
}

// CHECK-LABEL: func.func @reduction
// CHECK: %[[FILL:.+]] = linalg.fill {{.*}} tensor<128xf32>
// CHECK: scf.for %{{.*}} = %c0 to %c384 step %c8 iter_args(%{{.*}} = %[[FILL]])
// CHECK: linalg.generic {{.*}} ins(%{{.*}} : tensor<128x8xf32>)
// CHECK-SAME: %[[ARG0:[A-Za-z0-9]+]]: tensor<128x384x256xf32>
// CHECK-DAG: %[[C9216:.+]] = arith.constant 9216 : index
// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
// CHECK: %[[INIT:.+]] = linalg.fill {{.*}} tensor<128xf32>
// CHECK: scf.for %{{.*}} = %[[C0]] to %[[C9216]] step %[[C1]] iter_args(%[[ARG:.+]] = %[[INIT]])
// CHECK-NOT: scf.for
// CHECK: linalg.generic {{.*}} ins(%{{.*}} : tensor<128x8x4xf32>) outs(%[[ARG]] : tensor<128xf32>)
// CHECK: scf.yield
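// Trip-count note: the parent loop runs 3 times, and the reduction tile sizes
// [8, 4] over the 384 and 256 reduction dims give 384/8 = 48 and 256/4 = 64
// tiled iterations, so the coalesced loop has 3 * 48 * 64 = 9216 iterations,
// matching %[[C9216]] above.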

// Verify that no tiling happens in the thread case.
@@ -142,6 +156,47 @@ func.func @reduction(%3: tensor<128x384xf32>) -> tensor<128xf32> {

// -----

// Test that coalescing is skipped when loops have dynamic trip counts.
#config_dyn = #iree_gpu.lowering_config<{reduction = [0, 8, 4]}>
#map_dyn1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map_dyn2 = affine_map<(d0, d1, d2) -> (d0)>
func.func @reduction_dynamic_trip_count(%arg0: tensor<128x384x256xf32>, %dyn_ub: index) -> tensor<128xf32> {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%cst = arith.constant 0.000000e+00 : f32
%empty = tensor.empty() : tensor<128xf32>
%init = linalg.fill ins(%cst : f32) outs(%empty : tensor<128xf32>) -> tensor<128xf32>

// Parent scf.for loop with dynamic upper bound.
// This should NOT be coalesced with reduction tiling loops.
%result = scf.for %iv = %c0 to %dyn_ub step %c1 iter_args(%arg1 = %init) -> (tensor<128xf32>) {
%slice = tensor.extract_slice %arg0[0, 0, 0] [128, 384, 256] [1, 1, 1] : tensor<128x384x256xf32> to tensor<128x384x256xf32>
%reduced = linalg.generic {
indexing_maps = [#map_dyn1, #map_dyn2],
iterator_types = ["parallel", "reduction", "reduction"]
} ins(%slice : tensor<128x384x256xf32>) outs(%arg1 : tensor<128xf32>) attrs = {lowering_config = #config_dyn} {
^bb0(%in: f32, %out: f32):
%add = arith.addf %in, %out : f32
linalg.yield %add : f32
} -> tensor<128xf32>
scf.yield %reduced : tensor<128xf32>
}
return %result : tensor<128xf32>
}

// CHECK-LABEL: func.func @reduction_dynamic_trip_count
// CHECK-SAME: %[[ARG0:[A-Za-z0-9]+]]: tensor<128x384x256xf32>
// CHECK-SAME: %[[DYN_UB:[A-Za-z0-9]+]]: index
// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
// CHECK: %[[INIT:.+]] = linalg.fill {{.*}} tensor<128xf32>
// CHECK: scf.for %{{.*}} = %[[C0]] to %[[DYN_UB]] step %[[C1]] iter_args(%[[ARG1:.+]] = %[[INIT]])
// CHECK: scf.for %{{.*}} = %[[C0]] to %{{.*}} step %{{.*}} iter_args(%[[ARG2:.+]] = %[[ARG1]])
// CHECK: scf.for %{{.*}} = %[[C0]] to %{{.*}} step %{{.*}} iter_args(%[[ARG3:.+]] = %[[ARG2]])
// CHECK: linalg.generic {{.*}} ins(%{{.*}} : tensor<128x8x4xf32>) outs(%[[ARG3]] : tensor<128xf32>)
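// Because %dyn_ub is not a compile-time constant, the parent loop keeps its
// dynamic trip count, and the two reduction tiling loops (384/8 = 48 and
// 256/4 = 64 iterations) remain nested inside it instead of being coalesced.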

// -----

#config = #iree_gpu.lowering_config<{reduction = [0, 0, 8]}>
#map = affine_map<(d0, d1) -> (d0, d1)>
func.func @matmul_fuse(%3: tensor<64x64xf32>, %4: tensor<64x64xf32>, %5: tensor<64x64xf32>) -> tensor<64x64xf32> {
@@ -19,10 +19,11 @@ func.func @conv_nhwc_generic(%a: tensor<1x3x66x8xf32>, %b: tensor<32x3x3x8xf32>,
}

// CHECK-LABEL: func.func @conv_nhwc_generic
// CHECK: scf.for %{{.*}} = %c0 to %c3 step %c1
// CHECK: scf.for %{{.*}} = %c0 to %c3 step %c1
// CHECK: linalg.generic
// CHECK-SAME: affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d6)>
// CHECK-DAG: %[[C9:.+]] = arith.constant 9 : index
// CHECK: scf.for %{{.*}} = %c0 to %[[C9]] step %c1
// CHECK-NOT: scf.for
// CHECK: linalg.generic
// CHECK-SAME: affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d6)>
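// Trip-count note: the two size-3 filter window loops collapse into a single
// loop of 3 * 3 = 9 iterations, matching %[[C9]].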

// -----

@@ -35,10 +36,11 @@ func.func @conv_nhwc_named_dilated(%a: tensor<1x5x68x8xf32>, %b: tensor<32x3x3x8
}

// CHECK-LABEL: func.func @conv_nhwc_named_dilated
// CHECK: scf.for %{{.*}} = %c0 to %c3 step %c1
// CHECK: scf.for %{{.*}} = %c0 to %c3 step %c1
// CHECK: linalg.generic
// CHECK-SAME: affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d6)>
// CHECK-DAG: %[[C9:.+]] = arith.constant 9 : index
// CHECK: scf.for %{{.*}} = %c0 to %[[C9]] step %c1
// CHECK-NOT: scf.for
// CHECK: linalg.generic
// CHECK-SAME: affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d6)>

// -----

@@ -51,10 +53,11 @@ func.func @conv_nchw_named(%arg0: tensor<2x16x130x130xf32>, %arg1: tensor<32x16x
}

// CHECK-LABEL: func.func @conv_nchw_named
// CHECK: scf.for %{{.*}} = %c0 to %c3 step %c1
// CHECK: scf.for %{{.*}} = %c0 to %c3 step %c1
// CHECK: linalg.generic
// CHECK-SAME: affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d4, d2, d3)>
// CHECK-DAG: %[[C9:.+]] = arith.constant 9 : index
// CHECK: scf.for %{{.*}} = %c0 to %[[C9]] step %c1
// CHECK-NOT: scf.for
// CHECK: linalg.generic
// CHECK-SAME: affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d4, d2, d3)>

// -----

@@ -77,7 +80,8 @@ func.func @conv_chwn_generic(%a: tensor<16x24x16x16xf32>, %b: tensor<16x24x16x16
}

// CHECK-LABEL: func.func @conv_chwn_generic
// CHECK: scf.for %{{.*}} = %c0 to %c24 step %c1
// CHECK: scf.for %{{.*}} = %c0 to %c16 step %c1
// CHECK: linalg.generic
// CHECK-SAME: affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d1, d2, d3)>
// CHECK-DAG: %[[C384:.+]] = arith.constant 384 : index
// CHECK: scf.for %{{.*}} = %c0 to %[[C384]] step %c1
// CHECK-NOT: scf.for
// CHECK: linalg.generic
// CHECK-SAME: affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d1, d2, d3)>
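// Trip-count note: here the coalesced bound combines the former 24- and
// 16-iteration loops, 24 * 16 = 384, matching %[[C384]].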
compiler/src/iree/compiler/Codegen/Common/TileAndFuseUtils.cpp (50 additions, 0 deletions)
@@ -13,10 +13,13 @@
#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
#include "mlir/Analysis/TopologicalSortUtils.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
#include "mlir/Dialect/SCF/Utils/Utils.h"
#include "mlir/Dialect/Tensor/IR/Tensor.h"
#include "mlir/Dialect/Utils/StaticValueUtils.h"
#include "mlir/IR/Dominance.h"

#include <cassert>
@@ -482,6 +485,53 @@ LogicalResult applyTileAndFuseToEachRoot(
tileAndFuseOptions, tiledResults->loops);
}
}

// Coalesce scf.for loops created during reduction tiling.
// This is done at the very end after all other transformations
// to avoid invalidating dominance info or affecting fusion logic.
if (tilingLevel == IREE::GPU::TilingLevel::Reduction &&
!tiledResults->loops.empty()) {
SmallVector<scf::ForOp> forLoops;

// Check if tiling happened inside an existing scf.for loop.
// If so, include that parent loop in the coalescing.
Operation *parentOp =
tiledResults->loops.front().getOperation()->getParentOp();
scf::ForOp parentForOp = dyn_cast<scf::ForOp>(parentOp);
if (parentForOp) {
forLoops.push_back(parentForOp);
}

// Collect all the loops produced by reduction tiling.
for (LoopLikeOpInterface loop : tiledResults->loops) {
if (auto forOp = dyn_cast<scf::ForOp>(loop.getOperation())) {
forLoops.push_back(forOp);
}
}

// If loops have dynamic trip counts and we coalesce them, range
// analysis may no longer be able to find static bounds. This was
// mainly noticed as a problem in applyPaddingLevel, so to prevent a
// regression we don't coalesce such loops.
bool hasDynamicTripCount = false;
for (scf::ForOp forOp : forLoops) {
if (!getConstantIntValue(forOp.getLowerBound()) ||
!getConstantIntValue(forOp.getUpperBound()) ||
!getConstantIntValue(forOp.getStep())) {
hasDynamicTripCount = true;
LLVM_DEBUG(llvm::dbgs()
<< "Skipping coalescing: loop has dynamic trip count\n");
break;
}
}

if (forLoops.size() > 1 && !hasDynamicTripCount) {
if (failed(coalesceLoops(rewriter, forLoops))) {
// Coalescing failure is not critical; just log and continue.
LLVM_DEBUG(llvm::dbgs() << "Failed to coalesce reduction loops\n");
}
}
}
}
return success();
}
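
For reference, coalesceLoops (from the SCF utilities included above) folds a perfectly nested loop band into a single loop and recovers the original induction variables by delinearizing the fused one. Below is a minimal sketch of the effect, using hypothetical bounds rather than ones from the tests in this PR; the exact delinearization ops MLIR emits may differ (e.g. affine.delinearize_index):

// Before: a perfectly nested pair of loops with static trip counts 2 and 4.
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%c4 = arith.constant 4 : index
scf.for %i = %c0 to %c2 step %c1 {
  scf.for %j = %c0 to %c4 step %c1 {
    "test.use"(%i, %j) : (index, index) -> ()
  }
}

// After coalescing: a single loop of 2 * 4 = 8 iterations; %i and %j are
// rebuilt from the fused induction variable by div/mod against the inner
// trip count.
%c8 = arith.constant 8 : index
scf.for %iv = %c0 to %c8 step %c1 {
  %i = arith.divui %iv, %c4 : index
  %j = arith.remui %iv, %c4 : index
  "test.use"(%i, %j) : (index, index) -> ()
}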
@@ -62,21 +62,18 @@ hal.executable private @main {
// CHECK-DAG: memref.alloc() : memref<1x1x32x68xf16, #gpu.address_space<workgroup>>
// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
// CHECK-DAG: %[[C3:.+]] = arith.constant 3 : index
// CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index
// CHECK-DAG: %[[C36:.+]] = arith.constant 36 : index
// CHECK-DAG: %[[C81:.+]] = arith.constant 81 : index
// CHECK: scf.forall ({{.*}}) in (16, 48, 9) {
// CHECK: scf.for {{.+}} = %[[C0]] to %[[C3]] step %[[C1]] {{.*}} -> (vector<1x1x1x1x4x1xf32>)
// CHECK: scf.for {{.+}} = %[[C0]] to %[[C3]] step %[[C1]] {{.*}} -> (vector<1x1x1x1x4x1xf32>)
// CHECK: scf.for {{.+}} = %[[C0]] to %[[C36]] step %[[C4]] {{.*}} -> (vector<1x1x1x1x4x1xf32>)
// CHECK: gpu.barrier memfence [#gpu.address_space<workgroup>]
// CHECK-DAG: %[[LHS_RD:.+]] = vector.transfer_read {{.+}} : {{.*}}vector<4xf16>
// CHECK-DAG: vector.transfer_write %[[LHS_RD]]
// CHECK-DAG: %[[RHS_RD:.+]] = vector.transfer_read {{.+}} : {{.*}}vector<8xf16>
// CHECK-DAG: vector.transfer_write %[[RHS_RD]]
// CHECK: gpu.barrier memfence [#gpu.address_space<workgroup>]
// CHECK-DAG: vector.transfer_read {{.*}} vector<4x4xf16>
// CHECK-DAG: vector.transfer_read {{.*}} vector<4x4xf16>
// CHECK-COUNT-4: amdgpu.mfma 16x16x16
// CHECK: scf.for {{.+}} = %[[C0]] to %[[C81]] step %[[C1]] {{.*}} -> (vector<1x1x1x1x4x1xf32>)
// CHECK-NOT: scf.for
// CHECK: gpu.barrier memfence [#gpu.address_space<workgroup>]
// CHECK-DAG: %[[LHS_RD:.+]] = vector.transfer_read {{.+}} : {{.*}}vector<4xf16>
// CHECK-DAG: vector.transfer_write %[[LHS_RD]]
// CHECK-DAG: %[[RHS_RD:.+]] = vector.transfer_read {{.+}} : {{.*}}vector<8xf16>
// CHECK-DAG: vector.transfer_write %[[RHS_RD]]
// CHECK: gpu.barrier memfence [#gpu.address_space<workgroup>]
// CHECK-DAG: vector.transfer_read {{.*}} vector<4x4xf16>
// CHECK-DAG: vector.transfer_read {{.*}} vector<4x4xf16>
// CHECK-COUNT-4: amdgpu.mfma 16x16x16
// CHECK: vector.transfer_write %{{.*}}, %[[BUF2]]
// CHECK: } {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
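// Trip-count note: the former 3 x 3 x 9 loop nest (the last loop ran from
// %[[C0]] to %[[C36]] step %[[C4]], i.e. 9 iterations) coalesces into a
// single loop of 3 * 3 * 9 = 81 iterations, matching %[[C81]].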
@@ -1066,16 +1066,20 @@
}

// CHECK-LABEL: func @elemwise_reduction_elemwise
// CHECK: scf.for %{{.*}} = %{{.*}} to %c16 step %c1 {{.*}} -> (vector<1xf32>)
// CHECK: scf.for
// CHECK: scf.for
// CHECK: %[[REDUCE:.+]] = vector.multi_reduction
// CHECK: scf.yield %[[REDUCE]]

// CHECK: scf.for %{{.*}} = %{{.*}} to %c16 step %c1
// CHECK: scf.for
// CHECK-COUNT-4: arith.addf {{.*}} : vector<9xf32>
// CHECK: vector.transfer_write {{.*}} vector<9xi8>, memref<32x16x9x9xi8, #amdgpu.address_space<fat_raw_buffer>>
// CHECK-DAG: %[[C144:.+]] = arith.constant 144 : index
// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index
// CHECK-DAG: %[[C9:.+]] = arith.constant 9 : index
// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
// CHECK: scf.for %{{.*}} = %[[C0]] to %[[C144]] step %[[C1]] {{.*}} -> (vector<1xf32>)
// CHECK-NOT: scf.for
// CHECK: %[[REDUCE:.+]] = vector.multi_reduction
// CHECK: scf.yield %[[REDUCE]]

// CHECK: scf.for %{{.*}} = %[[C0]] to %[[C16]] step %[[C1]]
// CHECK: scf.for %{{.*}} = %[[C0]] to %[[C9]] step %[[C1]]
// CHECK-COUNT-4: arith.addf {{.*}} : vector<9xf32>
// CHECK: vector.transfer_write {{.*}} vector<9xi8>, memref<32x16x9x9xi8, #amdgpu.address_space<fat_raw_buffer>>
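// Trip-count note: the reduction loops coalesce into a single loop of 144
// iterations (%[[C144]], presumably 16 * 9 given the other constants), while
// the trailing elementwise loop nest over %[[C16]] and %[[C9]] was not
// created by reduction tiling and is left nested.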

// -----
