
Commit eb7a855

wlei-llvm authored and meta-codesync[bot] committed
[AutoWS] Add Data Dependence Graph for modulo scheduling (#1225)
Summary:
Pull Request resolved: #1225

Add the DDG builder that analyzes a loop body to create a dependence graph. Each node is an operation classified by its hardware pipeline (MEM/TC/CUDA/SFU) with latency from the LatencyModel. Edges represent data dependencies with latencies and loop-carried distances. Also includes ResMII/RecMII/MinII computation and critical path analysis. Inner scf.for loops are skipped — super-node modeling for outer loop scheduling is added in a follow-up diff.

Authored with Claude.

Reviewed By: htyu

Differential Revision: D99955817

fbshipit-source-id: 95a99da5a8d4ebf1c09de179fb0af92fad7254ca
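For orientation, a minimal sketch of how a scheduling pass might drive the new API. Only DataDependenceGraph::build, computeMinII, and computeCriticalPathHeights come from this diff; the pass wiring and the default-constructed LatencyModel are assumptions for illustration.

// Sketch only: not part of this commit. Assumes LatencyModel is
// default-constructible; adjust to however the real model is created.
#include "DataDependenceGraph.h"
#include "mlir/IR/BuiltinOps.h"
#include "llvm/Support/raw_ostream.h"

using namespace mlir;
using namespace mlir::triton::gpu;

static void analyzeLoops(ModuleOp module) {
  LatencyModel model; // assumption: see LatencyModel.cpp for the real setup
  module.walk([&](scf::ForOp loop) {
    // Build the DDG for this loop body (inner scf.for ops are skipped).
    DataDependenceGraph ddg = DataDependenceGraph::build(loop, model);

    // Lower bound on the initiation interval: max(ResMII, RecMII).
    int minII = ddg.computeMinII();

    // Critical-path heights give a priority order for modulo scheduling:
    // place nodes with the greatest height first.
    llvm::DenseMap<unsigned, int> heights = ddg.computeCriticalPathHeights();

    llvm::errs() << "loop MinII = " << minII << ", " << heights.size()
                 << " nodes ranked\n";
  });
}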
1 parent 2d83319 commit eb7a855

3 files changed

Lines changed: 361 additions & 0 deletions

File tree

third_party/nvidia/hopper/lib/Transforms/CMakeLists.txt
third_party/nvidia/hopper/lib/Transforms/ModuloScheduling/DataDependenceGraph.cpp
third_party/nvidia/hopper/lib/Transforms/ModuloScheduling/DataDependenceGraph.h

third_party/nvidia/hopper/lib/Transforms/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -17,6 +17,7 @@ add_triton_library(NVHopperTransforms
   WarpSpecialization/WSTaskPartition.cpp
   WarpSpecialization/PartitionSchedulingMeta.cpp
   ModuloScheduling/LatencyModel.cpp
+  ModuloScheduling/DataDependenceGraph.cpp

   DEPENDS
   NVHopperTransformsIncGen
third_party/nvidia/hopper/lib/Transforms/ModuloScheduling/DataDependenceGraph.cpp

Lines changed: 284 additions & 0 deletions

@@ -0,0 +1,284 @@
// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.

#include "DataDependenceGraph.h"

#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/IR/BuiltinTypes.h"
#include "triton/Dialect/TritonGPU/IR/Dialect.h"
#include "triton/Dialect/TritonNvidiaGPU/IR/Dialect.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cmath>
#include <functional>
#include <queue>

#define DEBUG_TYPE "modulo-scheduling-ddg"
#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")

namespace mlir::triton::gpu {

namespace ttng = mlir::triton::nvidia_gpu;

unsigned DataDependenceGraph::addNode(Operation *op,
                                      const LatencyModel &model) {
  auto info = model.getLatency(op);
  unsigned idx = nodes.size();
  DDGNode node;
  node.op = op;
  node.idx = idx;
  node.pipeline = info.pipeline;
  node.latency = info.latency;
  node.selfLatency = info.selfLatency;
  nodes.push_back(node);
  opToIdx[op] = idx;
  return idx;
}

void DataDependenceGraph::addEdge(unsigned src, unsigned dst, int latency,
                                  unsigned distance) {
  edges.push_back(DDGEdge{src, dst, latency, distance});
  nodes[src].succs.push_back(dst);
  nodes[dst].preds.push_back(src);
}

DataDependenceGraph DataDependenceGraph::build(scf::ForOp loop,
                                               const LatencyModel &model) {
  DataDependenceGraph ddg;

  // Phase 1: Create nodes for every op in the loop body (except terminator).
  auto &body = loop.getBody()->getOperations();
  for (auto &op : body) {
    if (op.hasTrait<OpTrait::IsTerminator>())
      continue;
    // Skip inner scf.for loops — this DDG handles flat loop bodies only.
    // Inner loop super-node modeling is added in a follow-up diff for
    // outer loop (persistent kernel) scheduling.
    if (isa<scf::ForOp>(op))
      continue;
    ddg.addNode(&op, model);
  }

  // Phase 2: Intra-iteration edges from SSA def-use chains.
  for (auto &node : ddg.nodes) {
    for (auto operand : node.op->getOperands()) {
      auto *defOp = operand.getDefiningOp();
      if (!defOp || defOp->getNumResults() == 0)
        continue;
      auto it = ddg.opToIdx.find(defOp);
      if (it == ddg.opToIdx.end())
        continue;
      unsigned srcIdx = it->second;
      // Edge latency = producer's latency (time until result available).
      // Exception: for MEM → local_alloc edges, use selfLatency instead of
      // the full async latency. local_alloc is a format conversion (registers
      // → SMEM) that must stay at the same pipeline stage as its load.
      // The async overhead only applies to the MMA consumer, not local_alloc.
      int edgeLatency = ddg.nodes[srcIdx].latency;
      if (ddg.nodes[srcIdx].pipeline == HWPipeline::MEM &&
          isa<triton::gpu::LocalAllocOp>(node.op)) {
        edgeLatency = ddg.nodes[srcIdx].selfLatency;
      }
      ddg.addEdge(srcIdx, node.idx, edgeLatency, /*distance=*/0);
    }
  }

  // Phase 3: Loop-carried edges via scf.yield → iter_args.
  auto yieldOp = loop.getBody()->getTerminator();
  auto iterArgs = loop.getRegionIterArgs();
  for (unsigned i = 0; i < yieldOp->getNumOperands(); ++i) {
    auto yieldVal = yieldOp->getOperand(i);
    auto *yieldDef = yieldVal.getDefiningOp();
    if (!yieldDef || yieldDef->getNumResults() == 0 ||
        ddg.opToIdx.count(yieldDef) == 0)
      continue;
    unsigned srcIdx = ddg.opToIdx[yieldDef];

    // The iter_arg at position i receives yieldVal in the next iteration.
    // Find all users of that iter_arg within the loop body.
    if (i >= iterArgs.size())
      continue;
    auto iterArg = iterArgs[i];
    for (auto *user : iterArg.getUsers()) {
      if (user->hasTrait<OpTrait::IsTerminator>())
        continue;
      auto userIt = ddg.opToIdx.find(user);
      if (userIt == ddg.opToIdx.end())
        continue;
      ddg.addEdge(srcIdx, userIt->second, ddg.nodes[srcIdx].latency,
                  /*distance=*/1);
    }
  }

  LLVM_DEBUG({
    llvm::dbgs() << "[DDG] Built DDG with " << ddg.nodes.size() << " nodes, "
                 << ddg.edges.size() << " edges\n";
    ddg.dump();
  });

  return ddg;
}

llvm::SmallVector<const DDGEdge *>
DataDependenceGraph::getInEdges(unsigned nodeIdx) const {
  llvm::SmallVector<const DDGEdge *> result;
  for (const auto &e : edges)
    if (e.dstIdx == nodeIdx)
      result.push_back(&e);
  return result;
}

llvm::SmallVector<const DDGEdge *>
DataDependenceGraph::getOutEdges(unsigned nodeIdx) const {
  llvm::SmallVector<const DDGEdge *> result;
  for (const auto &e : edges)
    if (e.srcIdx == nodeIdx)
      result.push_back(&e);
  return result;
}

llvm::DenseMap<unsigned, int>
DataDependenceGraph::computeCriticalPathHeights() const {
  llvm::DenseMap<unsigned, int> heights;
  llvm::DenseSet<unsigned> visiting; // cycle detection
  // Reverse topological order: process sinks first.
  // Use a DFS-based approach since the graph is small.
  std::function<int(unsigned)> computeHeight = [&](unsigned idx) -> int {
    auto it = heights.find(idx);
    if (it != heights.end())
      return it->second;
    // Guard against cycles in distance-0 edges. DDG construction guarantees
    // acyclicity, but this prevents infinite recursion if the invariant is
    // broken.
    if (!visiting.insert(idx).second)
      return 0;
    int maxSuccHeight = 0;
    for (const auto *edge : getOutEdges(idx)) {
      if (edge->distance > 0)
        continue; // skip loop-carried for critical path
      int succHeight = computeHeight(edge->dstIdx);
      maxSuccHeight = std::max(maxSuccHeight, edge->latency + succHeight);
    }
    visiting.erase(idx);
    heights[idx] = maxSuccHeight;
    return maxSuccHeight;
  };
  for (unsigned i = 0; i < nodes.size(); ++i)
    computeHeight(i);
  return heights;
}

int DataDependenceGraph::computeResMII() const {
  llvm::DenseMap<HWPipeline, int> pipeLoad;
  for (const auto &node : nodes) {
    if (node.pipeline == HWPipeline::NONE)
      continue;
    pipeLoad[node.pipeline] += node.selfLatency;
  }
  int maxLoad = 0;
  for (auto &[pipe, load] : pipeLoad) {
    LLVM_DEBUG(llvm::dbgs() << "[DDG] Pipeline " << getPipelineName(pipe)
                            << " load: " << load << " cycles\n");
    maxLoad = std::max(maxLoad, load);
  }
  return maxLoad;
}

int DataDependenceGraph::computeRecMII() const {
  // Compute RecMII = max over all recurrence circuits of
  // ceil(sum_lat / sum_dist).
  //
  // For each back-edge (distance > 0), find the longest forward path from
  // dst back to src. The recurrence latency = forward_path + back_edge_latency,
  // and distance = forward_distance + back_edge_distance. RecMII for that
  // circuit = ceil(total_lat / total_dist).
  //
  // We use Floyd-Warshall to compute longest forward paths (distance=0 edges
  // only), then combine with each back-edge.
  const unsigned N = nodes.size();
  if (N == 0)
    return 0;

  // Forward-path longest latencies (only distance=0 edges).
  constexpr int NEG_INF = -1;
  std::vector<std::vector<int>> fwdLat(N, std::vector<int>(N, NEG_INF));

  // Initialize with distance=0 edges only.
  for (const auto &e : edges) {
    if (e.distance == 0) {
      fwdLat[e.srcIdx][e.dstIdx] =
          std::max(fwdLat[e.srcIdx][e.dstIdx], e.latency);
    }
  }
  // Self-loops with distance 0.
  for (unsigned i = 0; i < N; ++i)
    fwdLat[i][i] = std::max(fwdLat[i][i], 0);

  // Floyd-Warshall on forward paths.
  for (unsigned k = 0; k < N; ++k) {
    for (unsigned i = 0; i < N; ++i) {
      for (unsigned j = 0; j < N; ++j) {
        if (fwdLat[i][k] == NEG_INF || fwdLat[k][j] == NEG_INF)
          continue;
        int newLat = fwdLat[i][k] + fwdLat[k][j];
        if (newLat > fwdLat[i][j])
          fwdLat[i][j] = newLat;
      }
    }
  }

  // For each back-edge, compute the recurrence ratio.
  int recMII = 0;
  for (const auto &e : edges) {
    if (e.distance == 0)
      continue;
    // Back-edge: src → dst with distance > 0.
    // Forward path: dst →...→ src (distance=0 edges).
    // Total recurrence: forward_lat + back_edge_lat, total_dist = e.distance.
    int forwardLat = fwdLat[e.dstIdx][e.srcIdx];
    if (forwardLat == NEG_INF)
      continue; // no forward path completes the circuit
    int totalLat = forwardLat + e.latency;
    int totalDist = e.distance;
    int rec = (totalLat + totalDist - 1) / totalDist; // ceil
    LLVM_DEBUG(llvm::dbgs() << "[DDG] Recurrence: back-edge " << e.srcIdx
                            << " -> " << e.dstIdx << " (dist=" << e.distance
                            << ") fwdLat=" << forwardLat << " totalLat="
                            << totalLat << " RecMII=" << rec << "\n");
    recMII = std::max(recMII, rec);
  }
  return recMII;
}

int DataDependenceGraph::computeMinII() const {
  int resMII = computeResMII();
  int recMII = computeRecMII();
  int minII = std::max(resMII, recMII);
  LLVM_DEBUG(llvm::dbgs() << "[DDG] ResMII=" << resMII << " RecMII=" << recMII
                          << " MinII=" << minII << "\n");
  return minII;
}

void DataDependenceGraph::dump() const {
  llvm::dbgs() << "=== DDG Dump ===\n";
  for (const auto &node : nodes) {
    llvm::dbgs() << "  Node " << node.idx
                 << ": pipeline=" << getPipelineName(node.pipeline)
                 << " latency=" << node.latency
                 << " selfLatency=" << node.selfLatency;
    if (node.isSuperNode)
      llvm::dbgs() << " [SUPER-NODE innerII=" << node.innerII << "]";
    llvm::dbgs() << " op=";
    node.op->print(llvm::dbgs(), OpPrintingFlags().skipRegions());
    llvm::dbgs() << "\n";
  }
  for (const auto &edge : edges) {
    llvm::dbgs() << "  Edge " << edge.srcIdx << " -> " << edge.dstIdx
                 << " latency=" << edge.latency << " distance=" << edge.distance
                 << "\n";
  }
  llvm::dbgs() << "=== End DDG ===\n";
}

} // namespace mlir::triton::gpu
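To make the recurrence bound concrete, here is a worked toy example of the arithmetic in computeRecMII(), with hypothetical latencies rather than values from the LatencyModel. The recurrence is acc feeding mul, mul feeding add, and add's result returning to mul via an iter_arg in the next iteration.

// Standalone toy example (hypothetical numbers, not part of this commit).
#include <cassert>

int main() {
  int forwardLat = 4;  // longest dst->src path over distance-0 edges:
                       // the mul -> add edge carries mul's latency
  int backEdgeLat = 2; // add -> mul loop-carried edge (add's latency)
  int totalDist = 1;   // the recurrence spans one iteration
  int totalLat = forwardLat + backEdgeLat;
  // Same integer ceiling trick as computeRecMII():
  int recMII = (totalLat + totalDist - 1) / totalDist;
  assert(recMII == 6); // iterations can start at most every 6 cycles
  return 0;
}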
third_party/nvidia/hopper/lib/Transforms/ModuloScheduling/DataDependenceGraph.h

Lines changed: 76 additions & 0 deletions

@@ -0,0 +1,76 @@
#ifndef TRITON_NVIDIA_HOPPER_MODULO_SCHEDULING_DDG_H
#define TRITON_NVIDIA_HOPPER_MODULO_SCHEDULING_DDG_H

#include "LatencyModel.h"

#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/IR/Operation.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"

namespace mlir::triton::gpu {

struct DDGEdge {
  unsigned srcIdx{};
  unsigned dstIdx{};
  int latency{};
  unsigned distance{}; // 0 = intra-iteration, 1+ = loop-carried
};

struct DDGNode {
  Operation *op{};
  unsigned idx{};
  HWPipeline pipeline{HWPipeline::NONE};
  int latency{};
  int selfLatency{};
  bool isSuperNode{false}; // True if this node represents an inner loop
  int innerII{0};          // If super-node, the inner loop's II
  int prologueLatency{0};  // If super-node, cycles before TC starts (MEM busy)
  llvm::SmallVector<unsigned> succs;
  llvm::SmallVector<unsigned> preds;
};

/// Data Dependence Graph for one scf.for loop body.
/// Captures both intra-iteration and loop-carried (distance-1) edges.
class DataDependenceGraph {
public:
  static DataDependenceGraph build(scf::ForOp loop, const LatencyModel &model);

  llvm::ArrayRef<DDGNode> getNodes() const { return nodes; }
  llvm::ArrayRef<DDGEdge> getEdges() const { return edges; }
  const DDGNode &getNode(unsigned idx) const { return nodes[idx]; }
  unsigned getNumNodes() const { return nodes.size(); }

  /// Get all incoming edges for a node.
  llvm::SmallVector<const DDGEdge *> getInEdges(unsigned nodeIdx) const;

  /// Get all outgoing edges for a node.
  llvm::SmallVector<const DDGEdge *> getOutEdges(unsigned nodeIdx) const;

  /// Compute critical-path height (bottom-up) from each node to any sink.
  llvm::DenseMap<unsigned, int> computeCriticalPathHeights() const;

  /// Compute ResMII: max over all pipelines of total self-latency.
  int computeResMII() const;

  /// Compute RecMII: max over all recurrence circuits of
  /// ceil(sum_lat / sum_dist).
  int computeRecMII() const;

  /// Compute MinII = max(ResMII, RecMII).
  int computeMinII() const;

  /// Dump the DDG to llvm::dbgs() for debugging.
  void dump() const;

private:
  llvm::SmallVector<DDGNode> nodes;
  llvm::SmallVector<DDGEdge> edges;
  llvm::DenseMap<Operation *, unsigned> opToIdx;

  unsigned addNode(Operation *op, const LatencyModel &model);
  void addEdge(unsigned src, unsigned dst, int latency, unsigned distance);
};

} // namespace mlir::triton::gpu

#endif // TRITON_NVIDIA_HOPPER_MODULO_SCHEDULING_DDG_H
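And the resource-side counterpart: with hypothetical pipeline loads, ResMII is the busiest pipeline's total self-latency, and MinII takes the larger of the two bounds, mirroring computeResMII() and computeMinII().

// Standalone toy example (hypothetical pipeline loads, not part of this commit).
#include <algorithm>
#include <cassert>

int main() {
  // ResMII: sum each pipeline's self-latency; the busiest pipeline is the bound.
  int memLoad = 3 * 4; // three MEM ops at selfLatency 4 -> 12 cycles
  int tcLoad = 2 * 8;  // two TC (tensor core) ops at selfLatency 8 -> 16 cycles
  int resMII = std::max(memLoad, tcLoad);
  // MinII combines the resource and recurrence bounds, as computeMinII() does.
  int recMII = 6; // from the recurrence example above
  int minII = std::max(resMII, recMII);
  assert(minII == 16); // resource-bound loop: the TC pipeline is the limiter
  return 0;
}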
