forked from Xilinx/mlir-air
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathAIRToAIESchedulingUtils.h
More file actions
278 lines (224 loc) · 11 KB
/
AIRToAIESchedulingUtils.h
File metadata and controls
278 lines (224 loc) · 11 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
//===- AIRToAIESchedulingUtils.h --------------------------------*- C++ -*-===//
//
// Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
// SPDX-License-Identifier: MIT
//
//===----------------------------------------------------------------------===//
#ifndef AIR_TO_AIE_SCHEDULING_UTILS_H
#define AIR_TO_AIE_SCHEDULING_UTILS_H
#include "air/Conversion/PassDetail.h"
#include "aie/Dialect/AIE/IR/AIEDialect.h"
#include "air/Dialect/AIR/AIRDialect.h"
#include "mlir/Pass/Pass.h"
using namespace mlir;
namespace xilinx {
namespace air {
// Direction queries for a memcpy op relative to a tile-side memory space.
// Both return FailureOr<bool>: callers must check for failure before reading
// the boolean result.
// NOTE(review): the precise inbound/outbound criteria live in the
// implementation file. Presumably "inbound" means the op writes into
// `tileMemSpace` and "outbound" means it reads from it — confirm.
FailureOr<bool> isTileInbound(air::MemcpyInterface memcpyOp,
air::MemorySpace tileMemSpace);
FailureOr<bool> isTileOutbound(air::MemcpyInterface memcpyOp,
air::MemorySpace tileMemSpace);
// Tile lookup by physical (col, row) coordinates within `aie_device`.
// NOTE(review): judging by the name only, the "OrNull" variant presumably
// returns a null TileOp when no matching tile exists — confirm against the
// implementation.
AIE::TileOp getPhysTileOpOrNull(AIE::DeviceOp aie_device, int col, int row);
// Get the TileOp using physical coordinates.
// NOTE(review): how this differs from getPhysTileOpOrNull when the tile is
// absent (e.g. whether it creates one) is not visible here — confirm.
AIE::TileOp getPhysTileOp(AIE::DeviceOp aie_device, int col, int row);
// Materialize a physical aie.tile by emitting an aie.logical_tile<tileType>
// with the given hints (use std::nullopt for "?"), running mlir-aie's
// SequentialPlacer, and resolving the result through getPhysTileOp. On
// placement failure, emits a diagnostic on `aie_device` and returns failure.
//
// Parameters:
//   aie_device - device op the tile belongs to; also receives the diagnostic
//                on placement failure.
//   tileType   - tile type used for the emitted aie.logical_tile.
//   col_hint   - column hint, or std::nullopt for unconstrained ("?").
//   row_hint   - row hint, or std::nullopt for unconstrained ("?").
//
// Caller must NOT be inside a greedy PatternRewriter callback; this helper
// uses plain OpBuilder + replaceAllUsesWith/erase, which would invalidate
// a greedy worklist's cached use-def edges (see RFC #1567 milestone 2).
mlir::FailureOr<AIE::TileOp> createTileViaPlacer(AIE::DeviceOp aie_device,
AIE::AIETileType tileType,
std::optional<int> col_hint,
std::optional<int> row_hint);
// Batched variant: emits N aie.logical_tile<tileType> ops (one per hint),
// runs the placer ONCE, and resolves each into a physical aie.tile. The
// returned vector parallels `hints`. Use this when multiple unconstrained
// or partially-constrained logical tiles must be placed together — e.g.,
// a herd of cores all asking (col, ?), which a per-tile placer would all
// map to the same row because state doesn't persist across place() calls.
//
// Results are delivered through the `outTiles` out-parameter; the
// LogicalResult return value reports success or failure of the placement.
// NOTE(review): each hint pair is presumably ordered (col, row), matching the
// "(col, ?)" example above — confirm. Whether `outTiles` is cleared first or
// appended to is not visible from this declaration.
mlir::LogicalResult createTilesViaPlacer(
AIE::DeviceOp aie_device, AIE::AIETileType tileType,
llvm::ArrayRef<std::pair<std::optional<int>, std::optional<int>>> hints,
llvm::SmallVectorImpl<AIE::TileOp> &outTiles);
// Allocate an AIE::LockOp associated with `tile` in `aie_device`.
//   init - defaults to 0.
//   id   - defaults to -1.
//   name - optional name attribute; defaults to null.
// NOTE(review): `init` is presumably the lock's initial value and `id == -1`
// presumably requests automatic lock-id selection — confirm against the
// implementation.
AIE::LockOp allocateLockOp(AIE::DeviceOp aie_device, AIE::TileOp tile,
int init = 0, int id = -1,
StringAttr name = nullptr);
// Build a buffer name and return it as a std::stringstream.
//   prefix   - name prefix.
//   BufferId - taken by non-const reference, so the callee may modify it.
//   attr     - optional string attribute; defaults to null.
//   x, y     - default to -1.
// NOTE(review): presumably `BufferId` is a running counter that is advanced
// when used in the name, and x/y are tile coordinates that are omitted from
// the name when left at -1 — confirm against the implementation.
std::stringstream
generateBufferNameInStringStream(StringRef prefix, uint64_t &BufferId,
mlir::StringAttr attr = nullptr, int x = -1,
int y = -1);
// Allocate an AIE::ExternalBufferOp of type `memrefTy` in `device`.
//   BufferId - taken by non-const reference, so the callee may modify it.
//   attr     - optional string attribute; defaults to null.
//   x, y     - default to -1.
// NOTE(review): the trailing parameters mirror those of
// generateBufferNameInStringStream, so they are presumably used to name the
// buffer — confirm against the implementation.
AIE::ExternalBufferOp allocateExternalBufferOp(uint64_t &BufferId,
MemRefType memrefTy,
AIE::DeviceOp device,
mlir::StringAttr attr = nullptr,
int x = -1, int y = -1);
// Convert a SmallVector<int64_t, 6> into a std::vector<unsigned>.
// The argument is taken by value (copied at the call site).
// NOTE(review): int64_t -> unsigned is a narrowing conversion; how negative or
// oversized elements are treated is not visible here — confirm.
std::vector<unsigned> convertToStdVec(SmallVector<int64_t, 6> vec);
// Compare two std::vector<unsigned> values for equality.
// NOTE(review): both vectors are taken by non-const reference, so temporaries
// and const vectors cannot be passed. Whether the comparison is
// order-sensitive, and whether the arguments are modified (e.g. sorted), is
// not visible from this declaration — confirm.
bool areIdenticalVectors(std::vector<unsigned> &a, std::vector<unsigned> &b);
// Compute a single linear (1-D) offset from a memcpy's offset and stride
// operand lists. Both lists are taken by value.
// NOTE(review): presumably the result is the sum of offset[i] * stride[i] and
// requires the operands to be compile-time constants — confirm against the
// implementation.
int64_t get1DOffset(SmallVector<Value> memcpy_offsets,
SmallVector<Value> memcpy_strides);
// Given a vector of memcpy operations, return a map of their repeat counts,
// relative to a common ancestor region.
// NOTE(review): the map is presumably keyed by repeat count, with each value
// holding the set of ops that share that count — confirm. The input vector is
// taken by value.
llvm::MapVector<int, llvm::SetVector<Operation *>>
getRepeatCounts(std::vector<Operation *> memcpy_ops);
// Build a list of AIE::BDDimLayoutAttr from a memcpy's size and stride operand
// lists, creating the attributes in `ctx`. Both lists are taken by value.
// NOTE(review): presumably one attribute is produced per (size, stride)
// dimension pair, with the size acting as the "wrap" — confirm against the
// implementation, including the behavior for non-constant operands.
std::vector<AIE::BDDimLayoutAttr>
getWrapsAndStrides(SmallVector<Value> memcpy_sizes,
SmallVector<Value> memcpy_strides, MLIRContext *ctx);
// Compute a pair of lock values for `buffer_memref` under the given target
// model. The second overload additionally takes the air::ChannelOp involved.
// NOTE(review): the meaning of the two pair elements (e.g. acquire/release or
// read/write lock values) and how `air_chan` alters the result are not visible
// from these declarations — confirm against the implementation.
std::pair<int64_t, int64_t>
getLockValuePair(const AIE::AIETargetModel &targetModel, Value buffer_memref);
std::pair<int64_t, int64_t>
getLockValuePair(const AIE::AIETargetModel &targetModel, Value buffer_memref,
air::ChannelOp air_chan);
// Bookkeeping record for a single allocation: the tile and coordinates it
// sits on, the DMA channel it occupies, and the memcpy ops mapped onto it.
// All scalar fields default to "unset" sentinels (null tile, -1 indices).
struct allocation_info_t {
  // Tile hosting the DMA; null until assigned.
  AIE::TileOp dma_tile = nullptr;
  // Column / row recorded for this allocation; -1 until assigned.
  int64_t col = -1;
  int64_t row = -1;
  // Channel direction and index; the index defaults to -1.
  AIE::DMAChannel dma_channel = {AIE::DMAChannelDir::MM2S, -1};
  int64_t tile_channel = -1;
  int packet_flow_id = -1; // Packet flow ID assigned during flow creation
  std::vector<int32_t> dma_id;
  // Memcpy operations associated with this allocation.
  std::vector<Operation *> memcpyOps;

  // Queries implemented out of line.
  // NOTE(review): presumably each foundAlloc overload reports whether this
  // record matches the given tile / op / channel combination — confirm
  // against the implementation.
  bool valid();
  AIE::TileOp getDmaTile();
  bool foundAlloc(AIE::TileOp tile);
  bool foundAlloc(AIE::TileOp tile, air::MemcpyInterface memcpyOp);
  bool foundAlloc(AIE::TileOp tile, air::ChannelOp channel_op);
  bool foundAlloc(AIE::TileOp tile, AIE::DMAChannel channel);
  bool foundPacketFlowAllocInTile(AIE::TileOp tile);
  bool foundAlloc(air::ChannelOp channel_op);
  bool foundAlloc(AIE::DMAChannel channel);
  // Column-keyed; row is implied (shim is always row 0).
  bool foundAllocInColumn(int32_t col);
  bool foundAllocInColumn(int32_t col, AIE::DMAChannel channel);
  bool foundPacketFlowAllocInColumn(int32_t col);

  // Equality considers only dma_tile, col, row, dma_channel and tile_channel.
  // packet_flow_id, dma_id and memcpyOps do not take part in the comparison.
  bool operator==(const allocation_info_t &other) const {
    if (!(dma_tile == other.dma_tile))
      return false;
    if (!(col == other.col))
      return false;
    if (!(row == other.row))
      return false;
    if (!(dma_channel == other.dma_channel))
      return false;
    return tile_channel == other.tile_channel;
  }
};
// Bundling up memcpy ops into MM2S and S2MM ops sharing the same aie.flow
//
// One bundle holds a single MM2S side (one allocation, one op list) and
// multiple S2MM sides (a vector of allocations and a vector of op lists).
// NOTE(review): `S2MM_alloc` and `S2MM` are presumably parallel vectors
// indexed by S2MM endpoint — confirm against the implementation.
struct MemcpyBundleAsFlow {
// No in-class initializer; expected to be set by the constructors below.
Operation *air_flow_op; // Either air::DmaMemcpyNdOp or air::ChannelOp
int flow_op_group = -1; // Scheduling group index; (in scheduling strategy 2,
// flows of the same index can share DMA channels)
std::vector<allocation_info_t> S2MM_alloc;
// NOTE(review): by symmetry with MM2S, the inner vectors presumably hold
// air::ChannelGets — confirm.
std::vector<std::vector<Operation *>> S2MM;
allocation_info_t MM2S_alloc;
std::vector<Operation *> MM2S; // air::ChannelPuts
// Memory spaces of the two sides; no in-class initializers.
air::MemorySpace MM2S_memspace;
air::MemorySpace S2MM_memspace;
// Allocation counters per direction, starting at zero.
int numMM2SAllocs = 0;
int numS2MMAllocs = 0;
std::string
memcpyResourceType; // The type of mechanism used for the memcpy op,
// including dma_stream, dma_packet, and cascade.
// Add a memcpy op to this bundle; one overload per supported op kind.
// Returns a LogicalResult that callers should check.
LogicalResult pushBackMemcpyOpToBundle(air::DmaMemcpyNdOp memcpyOp);
LogicalResult pushBackMemcpyOpToBundle(air::ChannelGetOp memcpyOp);
LogicalResult pushBackMemcpyOpToBundle(air::ChannelPutOp memcpyOp);
LogicalResult pushBackMemcpyOpToBundle(air::ChannelInterface memcpyOp);
// Construct a bundle from either kind of flow op (see `air_flow_op`).
// NOTE(review): both are single-argument, non-explicit constructors, so they
// also act as implicit conversions from the op types.
MemcpyBundleAsFlow(air::DmaMemcpyNdOp dmaMemcpyOp);
MemcpyBundleAsFlow(air::ChannelOp chan);
};
// Base class for the DMA allocators declared in this header. Stores the
// target device and the memory space the DMA serves, and keeps the lists of
// MM2S / S2MM allocations and lock allocations made so far.
class DMAAllocator {
public:
// A device and memory space are mandatory, so there is no default ctor.
DMAAllocator() = delete;
DMAAllocator(AIE::DeviceOp device, air::MemorySpace dmaMemorySpace)
: device(device), dmaMemorySpace(dmaMemorySpace) {}
// Look up the allocation for `memcpyOp` on `tile`; the FailureOr result
// must be checked before use.
FailureOr<allocation_info_t>
lookupDMAAllocation(AIE::TileOp tile, air::MemcpyInterface &memcpyOp);
// Obtain the pair of locks for `memcpyOp` on `tile` and `bufferOp`.
// `lockRaceConditionFix` defaults to false.
// NOTE(review): the roles of the two returned locks and the effect of
// `lockRaceConditionFix` are not visible here — confirm in the implementation.
FailureOr<std::pair<AIE::LockOp, AIE::LockOp>>
getLockForDMA(air::MemcpyInterface &memcpyOp, AIE::TileOp tile,
Operation *bufferOp, bool lockRaceConditionFix = false);
// Record a new DMA channel allocation for `memcpyOp` with the given tile,
// channel index, coordinates and DMA ids.
FailureOr<allocation_info_t>
allocNewDmaChannel(air::MemcpyInterface &memcpyOp, AIE::TileOp tile, int chan,
int col, int row, std::vector<int> dma_id);
// NOTE(review): the vector is taken by value, so the caller's vector itself
// is never reordered; presumably this reorders allocator-internal state using
// the given op order — confirm.
void sortMemcpyOps(std::vector<Operation *> dma_memcpy_ops);
protected:
AIE::DeviceOp device;
air::MemorySpace dmaMemorySpace;
public:
// Allocations recorded per DMA direction.
std::vector<allocation_info_t> mm2s_allocs, s2mm_allocs;
// Lock allocations: (operation, channel op, DMA channel, lock, lock).
std::vector<std::tuple<Operation *, air::ChannelOp, AIE::DMAChannel,
AIE::LockOp, AIE::LockOp>>
lock_allocation_list;
// Per-buffer pair of counters.
// NOTE(review): meaning of the two counters is not visible here — confirm.
DenseMap<Value, std::pair<int, int>> passiveSideBufferUseCounters;
};
// DMA allocator whose memory space is fixed to air::MemorySpace::L1.
class TileDMAAllocator : public DMAAllocator {
public:
// NOTE(review): single-argument, non-explicit constructor; it also acts as
// an implicit conversion from AIE::DeviceOp.
TileDMAAllocator(AIE::DeviceOp device)
: DMAAllocator(device, air::MemorySpace::L1) {}
// A very simple scheme to allocate channels for dma operations.
// NOTE(review): the scheme itself was never described here (the original
// comment held a `<description>` placeholder); see the implementation.
// `chan` defaults to -1, presumably meaning "no specific channel requested"
// — confirm.
FailureOr<allocation_info_t>
simpleDmaChannelAlloc(air::MemcpyInterface &memcpyOp, AIE::TileOp tile,
int chan = -1);
// Get the AIE::BufferOp for `memcpyOp` on `tile`. The first (unnamed)
// uint64_t parameter carries no name in this declaration.
FailureOr<AIE::BufferOp> getBuffer(uint64_t, AIE::TileOp tile,
air::MemcpyInterface &memcpyOp);
};
// DMA allocator for shim DMAs. The constructor is defined out of line.
class ShimDMAAllocator : public DMAAllocator {
public:
// NOTE(review): presumably the columns that provide a shim DMA — confirm.
std::vector<int> dma_columns;
// No in-class initializer; expected to be set by the constructor.
int shim_dma_channels;
ShimDMAAllocator(AIE::DeviceOp device);
// The two allocNewDmaChannel overloads below hide the base-class
// DMAAllocator::allocNewDmaChannel (same name declared in a derived class).
// First form: allocate at the given (col, row).
FailureOr<allocation_info_t>
allocNewDmaChannel(air::MemcpyInterface &memcpyOp, int col, int row,
std::vector<Operation *> &dma_ops);
// Second form: allocate starting from an existing allocation record, which
// is passed by value.
FailureOr<allocation_info_t>
allocNewDmaChannel(air::MemcpyInterface &memcpyOp,
allocation_info_t existing_alloc,
std::vector<Operation *> &dma_ops);
// Get the external buffer for `memcpyOp`; `BufferId` is taken by non-const
// reference, so the callee may modify it.
FailureOr<AIE::ExternalBufferOp> getBuffer(uint64_t &BufferId,
AIE::TileOp tile,
air::MemcpyInterface &memcpyOp);
// Search `memcpy_flows` for a flow whose allocation can be reused for
// `alloc`, on the MM2S or S2MM side as selected by `isMM2S`.
// NOTE(review): `memcpy_flows` is taken by value, copying every bundle on
// each call; the reuse criteria are not visible here — confirm.
FailureOr<air::allocation_info_t>
foundFlowReuseOpportunity(std::vector<MemcpyBundleAsFlow> memcpy_flows,
air::allocation_info_t alloc, bool isMM2S);
};
// DMA allocator for memtile DMAs. The constructor is defined out of line.
class MemTileDMAAllocator : public DMAAllocator {
public:
// NOTE(review): presumably the columns that provide a memtile DMA — confirm.
std::vector<int> memtile_dma_columns;
MemTileDMAAllocator(AIE::DeviceOp device);
// Allocate a DMA channel for `memcpyOp`. `chan` defaults to -1.
// NOTE(review): -1 presumably means "no specific channel requested" — confirm.
FailureOr<allocation_info_t>
simpleDmaChannelAlloc(air::MemcpyInterface &memcpyOp, int chan = -1);
// Overload that takes an existing allocation record by non-const reference.
FailureOr<allocation_info_t>
simpleDmaChannelAlloc(air::MemcpyInterface &memcpyOp,
allocation_info_t &existing_alloc);
// tile derived from memcpyOp's buffer; param kept for signature uniformity.
FailureOr<AIE::BufferOp> getBuffer(uint64_t, AIE::TileOp tile,
air::MemcpyInterface &memcpyOp);
// Search `memcpy_flows` for a flow whose allocation can be reused for
// `alloc`, on the MM2S or S2MM side as selected by `isMM2S`.
// NOTE(review): `memcpy_flows` is taken by value, copying every bundle on
// each call; the reuse criteria are not visible here — confirm.
FailureOr<air::allocation_info_t>
foundFlowReuseOpportunity(std::vector<MemcpyBundleAsFlow> memcpy_flows,
air::allocation_info_t alloc, bool isMM2S);
};
// Allocator for cascade connections. It is a standalone class (not derived
// from DMAAllocator) that keeps its own device handle and fixes its memory
// space to air::MemorySpace::L1.
class CascadeAllocator {
public:
// A device is mandatory, so there is no default ctor.
CascadeAllocator() = delete;
// NOTE(review): single-argument, non-explicit constructor; it also acts as
// an implicit conversion from AIE::DeviceOp.
CascadeAllocator(AIE::DeviceOp device)
: device(device), dmaMemorySpace(air::MemorySpace::L1) {}
// Allocate a cascade for `memcpyOp`; the FailureOr result must be checked.
FailureOr<allocation_info_t> coreCascadeAlloc(air::MemcpyInterface &memcpyOp);
// Record a new cascade allocation for `memcpyOp` on `tile`.
FailureOr<allocation_info_t> allocNewCascade(air::MemcpyInterface &memcpyOp,
AIE::TileOp tile);
// tile derived from memcpyOp's buffer; param kept for signature uniformity.
FailureOr<AIE::BufferOp> getBuffer(uint64_t, AIE::TileOp tile,
air::MemcpyInterface &memcpyOp);
protected:
AIE::DeviceOp device;
air::MemorySpace dmaMemorySpace;
public:
// Allocations recorded for the put side and the get side of cascades.
std::vector<allocation_info_t> cascade_put_allocs, cascade_get_allocs;
};
// Run channel allocation over all bundled flows, using the shim, memtile,
// tile and cascade allocators supplied by the caller. Every argument is taken
// by non-const reference, so the flows and allocator state may be updated in
// place. Returns a LogicalResult that callers should check.
// NOTE(review): the order in which the allocators are consulted and what is
// written back into `memcpy_flows` are not visible here — confirm in the
// implementation.
LogicalResult
simpleDMAChannelAllocation(std::vector<MemcpyBundleAsFlow> &memcpy_flows,
ShimDMAAllocator &shim_dma_alloc,
MemTileDMAAllocator &memtile_dma_alloc,
TileDMAAllocator &tile_dma_alloc,
air::CascadeAllocator &core_cascade_alloc);
// Search `vec` for `item` and report the outcome as an int.
// NOTE(review): presumably returns the index of the first match, or -1 when
// absent — confirm against the definition.
// NOTE(review): only the declaration of this template is visible in this
// header. Translation units that include it can instantiate the template only
// if the definition is also visible to them or explicit instantiations are
// provided elsewhere — verify. Both arguments are taken by value.
template <typename T>
int foundInVector(T item, std::vector<T> vec);
// Report the scf.for loop depth of operation `o`.
// NOTE(review): presumably the number of enclosing scf.for ops; whether the
// count starts at 0 and where the upward walk stops are not visible here —
// confirm against the implementation.
int getSCFForLoopDepth(Operation *o);
// Group the bundled memcpy flows by loop. `memcpy_flows` is taken by
// non-const reference, so the bundles may be updated in place.
// NOTE(review): presumably this assigns each bundle's `flow_op_group`; the
// meaning of the boolean return value is not visible here — confirm against
// the implementation.
bool groupingMemcpysByLoop(std::vector<MemcpyBundleAsFlow> &memcpy_flows);
} // namespace air
} // namespace xilinx
#endif // AIR_TO_AIE_SCHEDULING_UTILS_H