Skip to content

Commit 4f0caea

Browse files
Assign correct tiles to reusable L1 buffer
1 parent d30ec7f commit 4f0caea

File tree

2 files changed

+71
-36
lines changed

2 files changed

+71
-36
lines changed

compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.cpp

+67-32
Original file line numberDiff line numberDiff line change
@@ -834,18 +834,18 @@ LogicalResult combineLogicalObjectFifos(
834834
// will make an attempt to combine the logical objectFifos as per the
835835
// following algorithm :-
836836
// a. Combine i-th and i+1-th L3->L2 DmaCpyNd ops.
837-
// b. Since step a would create a new L2 buffer (with combined shape), we
838-
// will
839-
// need to update the corresponding two L2->L1 Dma ops by indeed creating
840-
// new ones. NOTE: Both of these new L2->L1 Dma ops will be reusing the
841-
// same L1 buffers as well.
842-
// c. Now pick the unique core ops corresponding to i-th and i+1-th L2->L1
843-
// Dma
844-
// ops and do the following :-
837+
// b. Form reusable L1 buffer by assigning the cumulative tiles of the
838+
// intended core ops.
839+
// c. Since step a would create a new L2 buffer (with combined shape), we
840+
// will need to update the corresponding two L2->L1 Dma ops by
841+
// creating new ones. NOTE: Both of these new L2->L1 Dma ops will be
842+
// reusing the same L1 buffers as well.
843+
// d. Now pick the unique core ops corresponding to i-th and i+1-th L2->L1
844+
// Dma ops and do the following :-
845845
// 1. For i-th CoreOp insert an AccessOp from the same L1 buffer towards
846-
// the end.
846+
// the end.
847847
// 2. For i+1-th CoreOp insert an AccessOp from the same L1 buffer right
848-
// before the corresponding AccessOp within the same CoreOp.
848+
// before the corresponding AccessOp within the same CoreOp.
849849
for (unsigned i = 0, n = l3ToL2DmaOps.size(); i < n; i += 2) {
850850
// Step 1. Combine the picked L3->L2 DmaCpyNd pair.
851851
FailureOr<LogicalObjectFifoFromMemrefOp> maybeNewL2ObjectFifo =
@@ -855,14 +855,56 @@ LogicalResult combineLogicalObjectFifos(
855855
LogicalObjectFifoFromMemrefOp newL2ObjectFifo =
856856
maybeNewL2ObjectFifo.value();
857857

858-
// Step 2. We now have need to create two L2->L1 ops since the size has
858+
// Step 2. Form the reusable L1 buffer by assigning the cumulative tiles of
859+
// the intended core ops.
860+
LogicalObjectFifoFromMemrefOp reuseL1LogicalObjectFifoOp =
861+
l2ToL1DmaOps[i].getTargetObjectFifo();
862+
SmallVector<Value> tiles;
863+
auto addNewTileFrom = [&](CoreOp coreOp) -> LogicalResult {
864+
OpBuilder::InsertionGuard guard(rewriter);
865+
TileOp tileOp = coreOp.getTileOp();
866+
std::optional<int64_t> column = getConstantIntValue(tileOp.getCol());
867+
std::optional<int64_t> row = getConstantIntValue(tileOp.getRow());
868+
if (!column || !row) {
869+
return coreOp.emitOpError() << "has non-constant tile location";
870+
}
871+
rewriter.setInsertionPoint(reuseL1LogicalObjectFifoOp);
872+
auto colIndex = rewriter.create<arith::ConstantIndexOp>(
873+
rewriter.getUnknownLoc(), *column);
874+
auto rowIndex = rewriter.create<arith::ConstantIndexOp>(
875+
rewriter.getUnknownLoc(), *row);
876+
tileOp =
877+
rewriter.create<TileOp>(rewriter.getUnknownLoc(), colIndex, rowIndex);
878+
tiles.push_back(tileOp.getResult());
879+
return success();
880+
};
881+
std::optional<CoreOp> maybeFirstCoreOp = fetchUniqueCoreOp(l2ToL1DmaOps[i]);
882+
if (!maybeFirstCoreOp) return failure();
883+
CoreOp firstCoreOp = maybeFirstCoreOp.value();
884+
std::optional<CoreOp> maybeSecondCoreOp =
885+
fetchUniqueCoreOp(l2ToL1DmaOps[i + 1]);
886+
if (!maybeSecondCoreOp) return failure();
887+
CoreOp secondCoreOp = maybeSecondCoreOp.value();
888+
if (failed(addNewTileFrom(firstCoreOp)) ||
889+
failed(addNewTileFrom(secondCoreOp))) {
890+
return failure();
891+
}
892+
llvm::sort(tiles.begin(), tiles.end(),
893+
AMDAIE::TileOp::tileValueColumnAndRowComparator);
894+
rewriter.setInsertionPoint(reuseL1LogicalObjectFifoOp);
895+
reuseL1LogicalObjectFifoOp =
896+
rewriter.replaceOpWithNewOp<LogicalObjectFifoFromMemrefOp>(
897+
reuseL1LogicalObjectFifoOp,
898+
cast<LogicalObjectFifoType>(
899+
reuseL1LogicalObjectFifoOp.getOutput().getType()),
900+
reuseL1LogicalObjectFifoOp.getMemref(), tiles);
901+
902+
// Step 3. We now need to create two L2->L1 ops since the size has
859903
// changed. But for this we first need to find the new offset for L2 as
860904
// source.
861905
// TODO: For now I'm hardcoding the offsets but later it'd just depend
862906
// on combining/non-combining dimensions.
863907
// Offset = 0,0
864-
LogicalObjectFifoFromMemrefOp reuseL1LogicalObjectFifoOp =
865-
l2ToL1DmaOps[i].getTargetObjectFifo();
866908
SmallVector<OpFoldResult> newL2AsSourceOffsets =
867909
l2ToL1DmaOps[i].getSourceMixedOffsets();
868910
DmaCpyNdOp newFirstL2ToL1DmaOp = createL2ToL1ForReuse(
@@ -872,31 +914,24 @@ LogicalResult combineLogicalObjectFifos(
872914
// the first L2->L1 Dma.
873915
newL2AsSourceOffsets = l2ToL1DmaOps[i + 1].getSourceMixedOffsets();
874916
newL2AsSourceOffsets[1] = rewriter.getIndexAttr(1);
875-
DmaCpyNdOp newSecondL2ToL1DmaOp = createL2ToL1ForReuse(
876-
rewriter, l2ToL1DmaOps[i + 1], reuseL1LogicalObjectFifoOp,
877-
newL2ObjectFifo, newL2AsSourceOffsets);
917+
createL2ToL1ForReuse(rewriter, l2ToL1DmaOps[i + 1],
918+
reuseL1LogicalObjectFifoOp, newL2ObjectFifo,
919+
newL2AsSourceOffsets);
878920

879-
// Step 3. PICK the CoreOps associated with the 1:1 L2->L1.
921+
// Step 4. Pick the CoreOps associated with the 1:1 L2->L1.
880922
// For the first Core op we'll insert an access (of the same access type)
881923
// right after each access to the new first L2->L1 Dma op's L1 target.
882-
std::optional<CoreOp> maybeFirstCoreOp =
883-
fetchUniqueCoreOp(newFirstL2ToL1DmaOp);
884-
if (!maybeFirstCoreOp) return failure();
885-
CoreOp firstCoreOp = maybeFirstCoreOp.value();
886-
firstCoreOp.walk([&](AMDAIE::EndOp endOp) {
887-
OpBuilder::InsertionGuard guard(rewriter);
888-
// Hardcoding to `AMDAIE::MemoryAccess::Read`.
889-
rewriter.setInsertionPoint(endOp);
890-
rewriter.create<AMDAIE::LogicalObjectFifoAccessOp>(
891-
rewriter.getUnknownLoc(), reuseL1LogicalObjectFifoOp.getOutput(),
892-
AMDAIE::MemoryAccess::Read);
924+
firstCoreOp.walk([&](AMDAIE::LogicalObjectFifoAccessOp accessOp) {
925+
if (accessOp.getInput() == newFirstL2ToL1DmaOp.getTargetObjectFifo()) {
926+
OpBuilder::InsertionGuard guard(rewriter);
927+
rewriter.setInsertionPointAfter(accessOp);
928+
rewriter.create<AMDAIE::LogicalObjectFifoAccessOp>(
929+
rewriter.getUnknownLoc(), reuseL1LogicalObjectFifoOp.getOutput(),
930+
accessOp.getAccessType());
931+
}
893932
});
894933
// For the second Core op we'll insert `Read` right before the first read
895934
// from the corresponding L1 logicalobjectFifo.
896-
std::optional<CoreOp> maybeSecondCoreOp =
897-
fetchUniqueCoreOp(newSecondL2ToL1DmaOp);
898-
if (!maybeSecondCoreOp) return failure();
899-
CoreOp secondCoreOp = maybeSecondCoreOp.value();
900935
secondCoreOp.walk([&](AMDAIE::LogicalObjectFifoAccessOp accessOp) {
901936
if (accessOp.getInput() == l2ToL1DmaOps[i + 1].getTargetObjectFifo()) {
902937
OpBuilder::InsertionGuard guard(rewriter);

compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/combine_logicalobjfifos_for_connection_reuse.mlir

+4-4
Original file line numberDiff line numberDiff line change
@@ -32,16 +32,16 @@
3232
// CHECK: %[[DMA_CPY_ND_L3_TO_L2_1:.*]] = amdaie.dma_cpy_nd(
3333
// CHECK-SAME: %[[L2_OBJECTFIFO_1]][0, 0, 0, 0] [1, 2, 32, 32] [2048, 1024, 32, 1]
3434
// CHECK-SAME: %[[L3_OBJECTFIFO]][0, 0, %[[IV0_32]], %[[IV1_0]]] [1, 2, 32, 32] [4096, 32, 128, 1]
35-
// CHECK: %[[L1_OBJECTFIFO_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]], {%[[TILE_1]]}
35+
// CHECK: %[[L1_OBJECTFIFO_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]], {%[[TILE_1]], %[[TILE_0]]}
3636
// CHECK: %[[DMA_CPY_ND_L2_TO_L1_0:.*]] = amdaie.dma_cpy_nd(
3737
// CHECK-SAME: %[[L1_OBJECTFIFO_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1]
3838
// CHECK-SAME: %[[L2_OBJECTFIFO_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]
3939
// CHECK: amdaie.core(%[[TILE_1]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_0]]], out :
4040
// CHECK: linalg.generic
4141
// CHECK: %[[FIRST_READ:.*]] = amdaie.logicalobjectfifo.access(%[[L1_OBJECTFIFO_0]], Read)
42+
// CHECK: amdaie.logicalobjectfifo.access(%[[L1_OBJECTFIFO_0]], Read)
4243
// CHECK: linalg.generic
4344
// CHECK-SAME: %[[FIRST_READ]]
44-
// CHECK: amdaie.logicalobjectfifo.access(%[[L1_OBJECTFIFO_0]], Read)
4545
// CHECK: amdaie.end
4646
// CHECK: }
4747
// CHECK: %[[DMA_CPY_ND_L2_TO_L1_1:.*]] = amdaie.dma_cpy_nd(
@@ -55,16 +55,16 @@
5555
// CHECK-SAME: %[[SECOND_READ]]
5656
// CHECK: amdaie.end
5757
// CHECK: }
58-
// CHECK: %[[L1_OBJECTFIFO_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]], {%[[TILE_2]]}
58+
// CHECK: %[[L1_OBJECTFIFO_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]], {%[[TILE_3]], %[[TILE_2]]}
5959
// CHECK: %[[DMA_CPY_ND_L2_TO_L1_2:.*]] = amdaie.dma_cpy_nd(
6060
// CHECK-SAME: %[[L1_OBJECTFIFO_1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1]
6161
// CHECK-SAME: %[[L2_OBJECTFIFO_1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]
6262
// CHECK: amdaie.core(%[[TILE_2]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_2]]], out :
6363
// CHECK: linalg.generic
6464
// CHECK: %[[FIRST_READ:.*]] = amdaie.logicalobjectfifo.access(%[[L1_OBJECTFIFO_1]], Read)
65+
// CHECK: amdaie.logicalobjectfifo.access(%[[L1_OBJECTFIFO_1]], Read)
6566
// CHECK: linalg.generic
6667
// CHECK-SAME: %[[FIRST_READ]]
67-
// CHECK: amdaie.logicalobjectfifo.access(%[[L1_OBJECTFIFO_1]], Read)
6868
// CHECK: amdaie.end
6969
// CHECK: }
7070
// CHECK: %[[DMA_CPY_ND_L2_TO_L1_3:.*]] = amdaie.dma_cpy_nd(

0 commit comments

Comments
 (0)