@@ -834,18 +834,18 @@ LogicalResult combineLogicalObjectFifos(
834
834
// will make an attempt to combine the logical objectFifos as per the
835
835
// following algorithm :-
836
836
// a. Combine i-th and i+1-th L3->L2 DmaCpyNd ops.
837
- // b. Since step a would create a new L2 buffer (with combined shape), we
838
- // will
839
- // need to update the corresponding two L2->L1 Dma ops by indeed creating
840
- // new ones. NOTE: Both of these new L2->L1 Dma ops will be reusing the
841
- // same L1 buffers as well.
842
- // c. Now pick the unique core ops corresponding to i-th and i+1-th L2->L1
843
- // Dma
844
- // ops and do the following :-
837
+ // b. Form the reusable L1 buffer by assigning the cumulative tiles of the
838
+ // intended core ops.
839
+ // c. Since step a would create a new L2 buffer (with combined shape), we
840
+ // will need to update the corresponding two L2->L1 Dma ops by
841
+ // creating new ones. NOTE: Both of these new L2->L1 Dma ops will be
842
+ // reusing the same L1 buffers as well.
843
+ // d. Now pick the unique core ops corresponding to i-th and i+1-th L2->L1
844
+ // Dma ops and do the following :-
845
845
// 1. For i-th CoreOp insert an AccessOp from the same L1 buffer towards
846
- // the end.
846
+ // the end.
847
847
// 2. For i+1-th CoreOp insert an AccessOp from the same L1 buffer right
848
- // before the corresponding AccessOp within the same CoreOp.
848
+ // before the corresponding AccessOp within the same CoreOp.
849
849
for (unsigned i = 0 , n = l3ToL2DmaOps.size (); i < n; i += 2 ) {
850
850
// Step 1. Combine the picked L3->L2 DmaCpyNd pair.
851
851
FailureOr<LogicalObjectFifoFromMemrefOp> maybeNewL2ObjectFifo =
@@ -855,14 +855,56 @@ LogicalResult combineLogicalObjectFifos(
855
855
LogicalObjectFifoFromMemrefOp newL2ObjectFifo =
856
856
maybeNewL2ObjectFifo.value ();
857
857
858
- // Step 2. We now have need to create two L2->L1 ops since the size has
858
+ // Step 2. Form the reusable L1 buffer by assigning the cumulative tiles of
859
+ // the intended core ops.
860
+ LogicalObjectFifoFromMemrefOp reuseL1LogicalObjectFifoOp =
861
+ l2ToL1DmaOps[i].getTargetObjectFifo ();
862
+ SmallVector<Value> tiles;
863
+ auto addNewTileFrom = [&](CoreOp coreOp) -> LogicalResult {
864
+ OpBuilder::InsertionGuard guard (rewriter);
865
+ TileOp tileOp = coreOp.getTileOp ();
866
+ std::optional<int64_t > column = getConstantIntValue (tileOp.getCol ());
867
+ std::optional<int64_t > row = getConstantIntValue (tileOp.getRow ());
868
+ if (!column || !row) {
869
+ return coreOp.emitOpError () << " has non-constant tile location" ;
870
+ }
871
+ rewriter.setInsertionPoint (reuseL1LogicalObjectFifoOp);
872
+ auto colIndex = rewriter.create <arith::ConstantIndexOp>(
873
+ rewriter.getUnknownLoc (), *column);
874
+ auto rowIndex = rewriter.create <arith::ConstantIndexOp>(
875
+ rewriter.getUnknownLoc (), *row);
876
+ tileOp =
877
+ rewriter.create <TileOp>(rewriter.getUnknownLoc (), colIndex, rowIndex);
878
+ tiles.push_back (tileOp.getResult ());
879
+ return success ();
880
+ };
881
+ std::optional<CoreOp> maybeFirstCoreOp = fetchUniqueCoreOp (l2ToL1DmaOps[i]);
882
+ if (!maybeFirstCoreOp) return failure ();
883
+ CoreOp firstCoreOp = maybeFirstCoreOp.value ();
884
+ std::optional<CoreOp> maybeSecondCoreOp =
885
+ fetchUniqueCoreOp (l2ToL1DmaOps[i + 1 ]);
886
+ if (!maybeSecondCoreOp) return failure ();
887
+ CoreOp secondCoreOp = maybeSecondCoreOp.value ();
888
+ if (failed (addNewTileFrom (firstCoreOp)) ||
889
+ failed (addNewTileFrom (secondCoreOp))) {
890
+ return failure ();
891
+ }
892
+ llvm::sort (tiles.begin (), tiles.end (),
893
+ AMDAIE::TileOp::tileValueColumnAndRowComparator);
894
+ rewriter.setInsertionPoint (reuseL1LogicalObjectFifoOp);
895
+ reuseL1LogicalObjectFifoOp =
896
+ rewriter.replaceOpWithNewOp <LogicalObjectFifoFromMemrefOp>(
897
+ reuseL1LogicalObjectFifoOp,
898
+ cast<LogicalObjectFifoType>(
899
+ reuseL1LogicalObjectFifoOp.getOutput ().getType ()),
900
+ reuseL1LogicalObjectFifoOp.getMemref (), tiles);
901
+
902
+ // Step 3. We now need to create two L2->L1 ops since the size has
859
903
// changed. But for this we first need to find the new offset for L2 as
860
904
// source.
861
905
// TODO: For now I'm hardcoding the offsets but later it'd just depend
862
906
// on combining/non-combining dimensions.
863
907
// Offset = 0,0
864
- LogicalObjectFifoFromMemrefOp reuseL1LogicalObjectFifoOp =
865
- l2ToL1DmaOps[i].getTargetObjectFifo ();
866
908
SmallVector<OpFoldResult> newL2AsSourceOffsets =
867
909
l2ToL1DmaOps[i].getSourceMixedOffsets ();
868
910
DmaCpyNdOp newFirstL2ToL1DmaOp = createL2ToL1ForReuse (
@@ -872,31 +914,24 @@ LogicalResult combineLogicalObjectFifos(
872
914
// the first L2->L1 Dma.
873
915
newL2AsSourceOffsets = l2ToL1DmaOps[i + 1 ].getSourceMixedOffsets ();
874
916
newL2AsSourceOffsets[1 ] = rewriter.getIndexAttr (1 );
875
- DmaCpyNdOp newSecondL2ToL1DmaOp = createL2ToL1ForReuse (
876
- rewriter, l2ToL1DmaOps[i + 1 ], reuseL1LogicalObjectFifoOp,
877
- newL2ObjectFifo, newL2AsSourceOffsets);
917
+ createL2ToL1ForReuse (rewriter, l2ToL1DmaOps[i + 1 ],
918
+ reuseL1LogicalObjectFifoOp, newL2ObjectFifo ,
919
+ newL2AsSourceOffsets);
878
920
879
- // Step 3. PICK the CoreOps associated with the 1:1 L2->L1.
921
+ // Step 4. Pick the CoreOps associated with the 1:1 L2->L1.
880
922
// For the first Core op we'll insert Read at the end. It doesn't matter
881
923
// for now so we're gonna insert it right before amdaie.end op.
882
- std::optional<CoreOp> maybeFirstCoreOp =
883
- fetchUniqueCoreOp (newFirstL2ToL1DmaOp);
884
- if (!maybeFirstCoreOp) return failure ();
885
- CoreOp firstCoreOp = maybeFirstCoreOp.value ();
886
- firstCoreOp.walk ([&](AMDAIE::EndOp endOp) {
887
- OpBuilder::InsertionGuard guard (rewriter);
888
- // Hardcoding to `AMDAIE::MemoryAccess::Read`.
889
- rewriter.setInsertionPoint (endOp);
890
- rewriter.create <AMDAIE::LogicalObjectFifoAccessOp>(
891
- rewriter.getUnknownLoc (), reuseL1LogicalObjectFifoOp.getOutput (),
892
- AMDAIE::MemoryAccess::Read);
924
+ firstCoreOp.walk ([&](AMDAIE::LogicalObjectFifoAccessOp accessOp) {
925
+ if (accessOp.getInput () == newFirstL2ToL1DmaOp.getTargetObjectFifo ()) {
926
+ OpBuilder::InsertionGuard guard (rewriter);
927
+ rewriter.setInsertionPointAfter (accessOp);
928
+ rewriter.create <AMDAIE::LogicalObjectFifoAccessOp>(
929
+ rewriter.getUnknownLoc (), reuseL1LogicalObjectFifoOp.getOutput (),
930
+ accessOp.getAccessType ());
931
+ }
893
932
});
894
933
// For the second Core op we'll insert `Read` right before the first read
895
934
// from the corresponding L1 logicalobjectFifo.
896
- std::optional<CoreOp> maybeSecondCoreOp =
897
- fetchUniqueCoreOp (newSecondL2ToL1DmaOp);
898
- if (!maybeSecondCoreOp) return failure ();
899
- CoreOp secondCoreOp = maybeSecondCoreOp.value ();
900
935
secondCoreOp.walk ([&](AMDAIE::LogicalObjectFifoAccessOp accessOp) {
901
936
if (accessOp.getInput () == l2ToL1DmaOps[i + 1 ].getTargetObjectFifo ()) {
902
937
OpBuilder::InsertionGuard guard (rewriter);
0 commit comments