
Commit eb7a855

wlei-llvm authored and meta-codesync[bot] committed
[AutoWS] Add Data Dependence Graph for modulo scheduling (#1225)
Summary:
Pull Request resolved: #1225

Add the DDG builder that analyzes a loop body to create a dependence graph. Each node is an operation classified by its hardware pipeline (MEM/TC/CUDA/SFU) with latency from the LatencyModel. Edges represent data dependencies with latencies and loop-carried distances. Also includes ResMII/RecMII/MinII computation and critical path analysis. Inner scf.for loops are skipped — super-node modeling for outer loop scheduling is added in a follow-up diff.

Authored with Claude.

Reviewed By: htyu

Differential Revision: D99955817

fbshipit-source-id: 95a99da5a8d4ebf1c09de179fb0af92fad7254ca
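For orientation, a minimal sketch of how a scheduling pass might drive the new API. Only DataDependenceGraph::build, computeMinII, and computeCriticalPathHeights come from this diff; the pass wiring and the default-constructed LatencyModel are assumptions for illustration.

// Sketch only: not part of this commit. Assumes LatencyModel is
// default-constructible; adjust to however the real model is created.
#include "DataDependenceGraph.h"
#include "mlir/IR/BuiltinOps.h"
#include "llvm/Support/raw_ostream.h"

using namespace mlir;
using namespace mlir::triton::gpu;

static void analyzeLoops(ModuleOp module) {
  LatencyModel model; // assumption: see LatencyModel.cpp for the real setup
  module.walk([&](scf::ForOp loop) {
    // Build the DDG for this loop body (inner scf.for ops are skipped).
    DataDependenceGraph ddg = DataDependenceGraph::build(loop, model);

    // Lower bound on the initiation interval: max(ResMII, RecMII).
    int minII = ddg.computeMinII();

    // Critical-path heights give a priority order for modulo scheduling:
    // place nodes with the greatest height first.
    llvm::DenseMap<unsigned, int> heights = ddg.computeCriticalPathHeights();

    llvm::errs() << "loop MinII = " << minII << ", " << heights.size()
                 << " nodes ranked\n";
  });
}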
1 parent 2d83319 commit eb7a855

3 files changed

Lines changed: 361 additions & 0 deletions

File tree

third_party/nvidia/hopper/lib/Transforms/CMakeLists.txt
third_party/nvidia/hopper/lib/Transforms/ModuloScheduling/DataDependenceGraph.cpp
third_party/nvidia/hopper/lib/Transforms/ModuloScheduling/DataDependenceGraph.h

third_party/nvidia/hopper/lib/Transforms/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -17,6 +17,7 @@ add_triton_library(NVHopperTransforms
   WarpSpecialization/WSTaskPartition.cpp
   WarpSpecialization/PartitionSchedulingMeta.cpp
   ModuloScheduling/LatencyModel.cpp
+  ModuloScheduling/DataDependenceGraph.cpp

   DEPENDS
   NVHopperTransformsIncGen
third_party/nvidia/hopper/lib/Transforms/ModuloScheduling/DataDependenceGraph.cpp

Lines changed: 284 additions & 0 deletions

@@ -0,0 +1,284 @@
// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.

#include "DataDependenceGraph.h"

#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/IR/BuiltinTypes.h"
#include "triton/Dialect/TritonGPU/IR/Dialect.h"
#include "triton/Dialect/TritonNvidiaGPU/IR/Dialect.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cmath>
#include <functional>
#include <queue>

#define DEBUG_TYPE "modulo-scheduling-ddg"
#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")

namespace mlir::triton::gpu {

namespace ttng = mlir::triton::nvidia_gpu;

unsigned DataDependenceGraph::addNode(Operation *op,
                                      const LatencyModel &model) {
  auto info = model.getLatency(op);
  unsigned idx = nodes.size();
  DDGNode node;
  node.op = op;
  node.idx = idx;
  node.pipeline = info.pipeline;
  node.latency = info.latency;
  node.selfLatency = info.selfLatency;
  nodes.push_back(node);
  opToIdx[op] = idx;
  return idx;
}

void DataDependenceGraph::addEdge(unsigned src, unsigned dst, int latency,
                                  unsigned distance) {
  edges.push_back(DDGEdge{src, dst, latency, distance});
  nodes[src].succs.push_back(dst);
  nodes[dst].preds.push_back(src);
}

DataDependenceGraph DataDependenceGraph::build(scf::ForOp loop,
                                               const LatencyModel &model) {
  DataDependenceGraph ddg;

  // Phase 1: Create nodes for every op in the loop body (except terminator).
  auto &body = loop.getBody()->getOperations();
  for (auto &op : body) {
    if (op.hasTrait<OpTrait::IsTerminator>())
      continue;
    // Skip inner scf.for loops — this DDG handles flat loop bodies only.
    // Inner loop super-node modeling is added in a follow-up diff for
    // outer loop (persistent kernel) scheduling.
    if (isa<scf::ForOp>(op))
      continue;
    ddg.addNode(&op, model);
  }

  // Phase 2: Intra-iteration edges from SSA def-use chains.
  for (auto &node : ddg.nodes) {
    for (auto operand : node.op->getOperands()) {
      auto *defOp = operand.getDefiningOp();
      if (!defOp || defOp->getNumResults() == 0)
        continue;
      auto it = ddg.opToIdx.find(defOp);
      if (it == ddg.opToIdx.end())
        continue;
      unsigned srcIdx = it->second;
      // Edge latency = producer's latency (time until result available).
      // Exception: for MEM → local_alloc edges, use selfLatency instead of
      // the full async latency. local_alloc is a format conversion (registers
      // → SMEM) that must stay at the same pipeline stage as its load.
      // The async overhead only applies to the MMA consumer, not local_alloc.
      int edgeLatency = ddg.nodes[srcIdx].latency;
      if (ddg.nodes[srcIdx].pipeline == HWPipeline::MEM &&
          isa<triton::gpu::LocalAllocOp>(node.op)) {
        edgeLatency = ddg.nodes[srcIdx].selfLatency;
      }
      ddg.addEdge(srcIdx, node.idx, edgeLatency, /*distance=*/0);
    }
  }

  // Phase 3: Loop-carried edges via scf.yield → iter_args.
  auto yieldOp = loop.getBody()->getTerminator();
  auto iterArgs = loop.getRegionIterArgs();
  for (unsigned i = 0; i < yieldOp->getNumOperands(); ++i) {
    auto yieldVal = yieldOp->getOperand(i);
    auto *yieldDef = yieldVal.getDefiningOp();
    if (!yieldDef || yieldDef->getNumResults() == 0 ||
        ddg.opToIdx.count(yieldDef) == 0)
      continue;
    unsigned srcIdx = ddg.opToIdx[yieldDef];

    // The iter_arg at position i receives yieldVal in the next iteration.
    // Find all users of that iter_arg within the loop body.
    if (i >= iterArgs.size())
      continue;
    auto iterArg = iterArgs[i];
    for (auto *user : iterArg.getUsers()) {
      if (user->hasTrait<OpTrait::IsTerminator>())
        continue;
      auto userIt = ddg.opToIdx.find(user);
      if (userIt == ddg.opToIdx.end())
        continue;
      ddg.addEdge(srcIdx, userIt->second, ddg.nodes[srcIdx].latency,
                  /*distance=*/1);
    }
  }

  LLVM_DEBUG({
    llvm::dbgs() << "[DDG] Built DDG with " << ddg.nodes.size() << " nodes, "
                 << ddg.edges.size() << " edges\n";
    ddg.dump();
  });

  return ddg;
}

llvm::SmallVector<const DDGEdge *>
DataDependenceGraph::getInEdges(unsigned nodeIdx) const {
  llvm::SmallVector<const DDGEdge *> result;
  for (const auto &e : edges)
    if (e.dstIdx == nodeIdx)
      result.push_back(&e);
  return result;
}

llvm::SmallVector<const DDGEdge *>
DataDependenceGraph::getOutEdges(unsigned nodeIdx) const {
  llvm::SmallVector<const DDGEdge *> result;
  for (const auto &e : edges)
    if (e.srcIdx == nodeIdx)
      result.push_back(&e);
  return result;
}

llvm::DenseMap<unsigned, int>
DataDependenceGraph::computeCriticalPathHeights() const {
  llvm::DenseMap<unsigned, int> heights;
  llvm::DenseSet<unsigned> visiting; // cycle detection
  // Reverse topological order: process sinks first.
  // Use a DFS-based approach since the graph is small.
  std::function<int(unsigned)> computeHeight = [&](unsigned idx) -> int {
    auto it = heights.find(idx);
    if (it != heights.end())
      return it->second;
    // Guard against cycles in distance-0 edges. DDG construction guarantees
    // acyclicity, but this prevents infinite recursion if the invariant is
    // broken.
    if (!visiting.insert(idx).second)
      return 0;
    int maxSuccHeight = 0;
    for (const auto *edge : getOutEdges(idx)) {
      if (edge->distance > 0)
        continue; // skip loop-carried for critical path
      int succHeight = computeHeight(edge->dstIdx);
      maxSuccHeight = std::max(maxSuccHeight, edge->latency + succHeight);
    }
    visiting.erase(idx);
    heights[idx] = maxSuccHeight;
    return maxSuccHeight;
  };
  for (unsigned i = 0; i < nodes.size(); ++i)
    computeHeight(i);
  return heights;
}

int DataDependenceGraph::computeResMII() const {
  llvm::DenseMap<HWPipeline, int> pipeLoad;
  for (const auto &node : nodes) {
    if (node.pipeline == HWPipeline::NONE)
      continue;
    pipeLoad[node.pipeline] += node.selfLatency;
  }
  int maxLoad = 0;
  for (auto &[pipe, load] : pipeLoad) {
    LLVM_DEBUG(llvm::dbgs() << "[DDG] Pipeline " << getPipelineName(pipe)
                            << " load: " << load << " cycles\n");
    maxLoad = std::max(maxLoad, load);
  }
  return maxLoad;
}

int DataDependenceGraph::computeRecMII() const {
  // Compute RecMII = max over all recurrence circuits of
  // ceil(sum_lat / sum_dist).
  //
  // For each back-edge (distance > 0), find the longest forward path from
  // dst back to src. The recurrence latency = forward_path + back_edge_latency,
  // and distance = forward_distance + back_edge_distance. RecMII for that
  // circuit = ceil(total_lat / total_dist).
  //
  // We use Floyd-Warshall to compute longest forward paths (distance=0 edges
  // only), then combine with each back-edge.
  const unsigned N = nodes.size();
  if (N == 0)
    return 0;

  // Forward-path longest latencies (only distance=0 edges).
  constexpr int NEG_INF = -1;
  std::vector<std::vector<int>> fwdLat(N, std::vector<int>(N, NEG_INF));

  // Initialize with distance=0 edges only.
  for (const auto &e : edges) {
    if (e.distance == 0) {
      fwdLat[e.srcIdx][e.dstIdx] =
          std::max(fwdLat[e.srcIdx][e.dstIdx], e.latency);
    }
  }
  // Self-loops with distance 0.
  for (unsigned i = 0; i < N; ++i)
    fwdLat[i][i] = std::max(fwdLat[i][i], 0);

  // Floyd-Warshall on forward paths.
  for (unsigned k = 0; k < N; ++k) {
    for (unsigned i = 0; i < N; ++i) {
      for (unsigned j = 0; j < N; ++j) {
        if (fwdLat[i][k] == NEG_INF || fwdLat[k][j] == NEG_INF)
          continue;
        int newLat = fwdLat[i][k] + fwdLat[k][j];
        if (newLat > fwdLat[i][j])
          fwdLat[i][j] = newLat;
      }
    }
  }

  // For each back-edge, compute the recurrence ratio.
  int recMII = 0;
  for (const auto &e : edges) {
    if (e.distance == 0)
      continue;
    // Back-edge: src → dst with distance > 0.
    // Forward path: dst →...→ src (distance=0 edges).
    // Total recurrence: forward_lat + back_edge_lat, total_dist = e.distance.
    int forwardLat = fwdLat[e.dstIdx][e.srcIdx];
    if (forwardLat == NEG_INF)
      continue; // no forward path completes the circuit
    int totalLat = forwardLat + e.latency;
    int totalDist = e.distance;
    int rec = (totalLat + totalDist - 1) / totalDist; // ceil
    LLVM_DEBUG(llvm::dbgs() << "[DDG] Recurrence: back-edge " << e.srcIdx
                            << " -> " << e.dstIdx << " (dist=" << e.distance
                            << ") fwdLat=" << forwardLat << " totalLat="
                            << totalLat << " RecMII=" << rec << "\n");
    recMII = std::max(recMII, rec);
  }
  return recMII;
}

int DataDependenceGraph::computeMinII() const {
  int resMII = computeResMII();
  int recMII = computeRecMII();
  int minII = std::max(resMII, recMII);
  LLVM_DEBUG(llvm::dbgs() << "[DDG] ResMII=" << resMII << " RecMII=" << recMII
                          << " MinII=" << minII << "\n");
  return minII;
}

void DataDependenceGraph::dump() const {
  llvm::dbgs() << "=== DDG Dump ===\n";
  for (const auto &node : nodes) {
    llvm::dbgs() << "  Node " << node.idx
                 << ": pipeline=" << getPipelineName(node.pipeline)
                 << " latency=" << node.latency
                 << " selfLatency=" << node.selfLatency;
    if (node.isSuperNode)
      llvm::dbgs() << " [SUPER-NODE innerII=" << node.innerII << "]";
    llvm::dbgs() << " op=";
    node.op->print(llvm::dbgs(), OpPrintingFlags().skipRegions());
    llvm::dbgs() << "\n";
  }
  for (const auto &edge : edges) {
    llvm::dbgs() << "  Edge " << edge.srcIdx << " -> " << edge.dstIdx
                 << " latency=" << edge.latency << " distance=" << edge.distance
                 << "\n";
  }
  llvm::dbgs() << "=== End DDG ===\n";
}

} // namespace mlir::triton::gpu
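To make the recurrence bound concrete, here is a worked toy example of the arithmetic in computeRecMII(), with hypothetical latencies rather than values from the LatencyModel. The recurrence is acc feeding mul, mul feeding add, and add's result returning to mul via an iter_arg in the next iteration.

// Standalone toy example (hypothetical numbers, not part of this commit).
#include <cassert>

int main() {
  int forwardLat = 4;  // longest dst->src path over distance-0 edges:
                       // the mul -> add edge carries mul's latency
  int backEdgeLat = 2; // add -> mul loop-carried edge (add's latency)
  int totalDist = 1;   // the recurrence spans one iteration
  int totalLat = forwardLat + backEdgeLat;
  // Same integer ceiling trick as computeRecMII():
  int recMII = (totalLat + totalDist - 1) / totalDist;
  assert(recMII == 6); // iterations can start at most every 6 cycles
  return 0;
}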
third_party/nvidia/hopper/lib/Transforms/ModuloScheduling/DataDependenceGraph.h

Lines changed: 76 additions & 0 deletions

@@ -0,0 +1,76 @@
#ifndef TRITON_NVIDIA_HOPPER_MODULO_SCHEDULING_DDG_H
#define TRITON_NVIDIA_HOPPER_MODULO_SCHEDULING_DDG_H

#include "LatencyModel.h"

#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/IR/Operation.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"

namespace mlir::triton::gpu {

struct DDGEdge {
  unsigned srcIdx{};
  unsigned dstIdx{};
  int latency{};
  unsigned distance{}; // 0 = intra-iteration, 1+ = loop-carried
};

struct DDGNode {
  Operation *op{};
  unsigned idx{};
  HWPipeline pipeline{HWPipeline::NONE};
  int latency{};
  int selfLatency{};
  bool isSuperNode{false}; // True if this node represents an inner loop
  int innerII{0};          // If super-node, the inner loop's II
  int prologueLatency{0};  // If super-node, cycles before TC starts (MEM busy)
  llvm::SmallVector<unsigned> succs;
  llvm::SmallVector<unsigned> preds;
};

/// Data Dependence Graph for one scf.for loop body.
/// Captures both intra-iteration and loop-carried (distance-1) edges.
class DataDependenceGraph {
public:
  static DataDependenceGraph build(scf::ForOp loop, const LatencyModel &model);

  llvm::ArrayRef<DDGNode> getNodes() const { return nodes; }
  llvm::ArrayRef<DDGEdge> getEdges() const { return edges; }
  const DDGNode &getNode(unsigned idx) const { return nodes[idx]; }
  unsigned getNumNodes() const { return nodes.size(); }

  /// Get all incoming edges for a node.
  llvm::SmallVector<const DDGEdge *> getInEdges(unsigned nodeIdx) const;

  /// Get all outgoing edges for a node.
  llvm::SmallVector<const DDGEdge *> getOutEdges(unsigned nodeIdx) const;

  /// Compute critical-path height (bottom-up) from each node to any sink.
  llvm::DenseMap<unsigned, int> computeCriticalPathHeights() const;

  /// Compute ResMII: max over all pipelines of total self-latency.
  int computeResMII() const;

  /// Compute RecMII: max over all recurrence circuits of
  /// ceil(sum_lat / sum_dist).
  int computeRecMII() const;

  /// Compute MinII = max(ResMII, RecMII).
  int computeMinII() const;

  /// Dump the DDG to llvm::dbgs() for debugging.
  void dump() const;

private:
  llvm::SmallVector<DDGNode> nodes;
  llvm::SmallVector<DDGEdge> edges;
  llvm::DenseMap<Operation *, unsigned> opToIdx;

  unsigned addNode(Operation *op, const LatencyModel &model);
  void addEdge(unsigned src, unsigned dst, int latency, unsigned distance);
};

} // namespace mlir::triton::gpu

#endif // TRITON_NVIDIA_HOPPER_MODULO_SCHEDULING_DDG_H
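And the resource-side counterpart: with hypothetical pipeline loads, ResMII is the busiest pipeline's total self-latency, and MinII takes the larger of the two bounds, mirroring computeResMII() and computeMinII().

// Standalone toy example (hypothetical pipeline loads, not part of this commit).
#include <algorithm>
#include <cassert>

int main() {
  // ResMII: sum each pipeline's self-latency; the busiest pipeline is the bound.
  int memLoad = 3 * 4; // three MEM ops at selfLatency 4 -> 12 cycles
  int tcLoad = 2 * 8;  // two TC (tensor core) ops at selfLatency 8 -> 16 cycles
  int resMII = std::max(memLoad, tcLoad);
  // MinII combines the resource and recurrence bounds, as computeMinII() does.
  int recMII = 6; // from the recurrence example above
  int minII = std::max(resMII, recMII);
  assert(minII == 16); // resource-bound loop: the TC pipeline is the limiter
  return 0;
}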
