[multi-gpu] Phase 3: air-rank-to-mgpu lowering pass

erwei-xilinx · claude · erwei-xilinx · commit 6875ed3fe32f · 2026-05-06T20:15:36.000Z
New conversion pass that replaces each `air.rank` op by inlining its body
in place, with rank IDs computed at runtime via `mgpuGetRank()` and
delinearized into the rank's N-D iteration space. Replaces
`air-rank-to-launch` for the GPU pipeline (which serialized ranks via
scf.for — a placeholder for single-process execution).

After this pass each process executes the entire `air.rank` body once,
with its rank id resolved dynamically from the runtime. The heap lifecycle
calls bracket each parent function: `mgpuSymmetricHeapInit` is emitted once
at function entry and `mgpuSymmetricHeapDestroy` before each `func.return`
(once per function, not once per rank).

- `mlir/include/air/Conversion/AIRRankToMgpuPass.h` — public header
- `mlir/include/air/Conversion/GPUPasses.td` — `air-rank-to-mgpu` def
  with `heap-size` option (default 256 MB)
- `mlir/include/air/Conversion/GPUPassDetail.h` — `GEN_PASS_DEF_AIRRANKTOMGPU`
- `mlir/lib/Conversion/AIRRankToMgpuPass.cpp` — pass implementation
- `mlir/lib/Conversion/CMakeLists.txt`, `Passes.cpp` — registration
- `mlir/test/Conversion/AIRRankToMgpu/rank_to_mgpu.mlir` — FileCheck
  unit tests (10 cases; see Test plan below)
- `test/gpu/symmetric_heap_dma/air_sym_with_rank.mlir` — high-level
  air.rank-based equivalent of the Phase 2 hand-written reference
- `test/gpu/symmetric_heap_dma/run.sh` — `INPUT=rank|handwritten`
  selector to run either form through the same multi-process driver

FileCheck unit tests cover:
- 1D / 2D rank delinearization (remsi/divsi)
- Default + custom heap-size option
- Async form (token replacement via wait_all)
- Async dependencies (blocking wait_all insertion)
- Multiple `air.rank` ops per function (init/destroy emitted once)
- Multiple `func.return` paths (destroy before each)
- Kernel operand mapping (block args replaced by SSA operands)
- Idempotent extern decls across multiple functions
- No-op when no `air.rank` is present (audit-found bug fixed: pass was
  unconditionally inserting decls)

End-to-end: rad-mi300a-sh5-1, SHARE_GPU=1, 2 ranks, INPUT=rank — both
ranks PASS the cross-rank read.

Caveat: same SHARE_GPU=1 single-physical-GPU caveat as Phase 2. True
multi-GPU re-validation is needed before declaring multi-GPU production-
ready (blocked on ROCm-side work).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/mlir/include/air/Conversion/AIRRankToMgpuPass.h b/mlir/include/air/Conversion/AIRRankToMgpuPass.h
@@ -0,0 +1,22 @@
+//===- AIRRankToMgpuPass.h ---------------------------------------*- C++ -*-===//
+//
+// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+//===-----------------------------------------------------------------------===//
+
+#ifndef AIR_CONVERSION_AIR_RANK_TO_MGPU_PASS_H
+#define AIR_CONVERSION_AIR_RANK_TO_MGPU_PASS_H
+
+#include "mlir/Pass/Pass.h"
+#include <memory>
+
+namespace xilinx::air {
+
+/// Factory for the pass registered as `air-rank-to-mgpu`; returns a newly
+/// allocated pass instance owned by the caller.
+std::unique_ptr<mlir::Pass> createAIRRankToMgpuPass();
+
+} // namespace xilinx::air
+
+#endif // AIR_CONVERSION_AIR_RANK_TO_MGPU_PASS_H
diff --git a/mlir/include/air/Conversion/GPUPassDetail.h b/mlir/include/air/Conversion/GPUPassDetail.h
@@ -26,6 +26,7 @@ using namespace mlir;
 #define GEN_PASS_DEF_AIRTRANSLATETOLLVM
 #define GEN_PASS_DEF_CONVERTAIRTOROCDL
 #define GEN_PASS_DEF_CONVERTGPUKERNELOUTLINE
+#define GEN_PASS_DEF_AIRRANKTOMGPU
 #include "air/Conversion/GPUPasses.h.inc"
 
 } // namespace air
diff --git a/mlir/include/air/Conversion/GPUPasses.td b/mlir/include/air/Conversion/GPUPasses.td
@@ -49,4 +49,32 @@ def ConvertGPUKernelOutline : Pass<"air-gpu-outlining", "ModuleOp"> {
   let options = [];
 }
 
+// Module-level pass lowering `air.rank` to the mgpu* runtime ABI. The only
+// tunable is `heap-size`, the byte count handed to `mgpuSymmetricHeapInit`.
+def AIRRankToMgpu : Pass<"air-rank-to-mgpu", "ModuleOp"> {
+  let summary = "Lower air.rank to mgpu* runtime calls (multi-GPU process model)";
+  let description = [{
+    Each `air.rank` op is replaced by inlining its body in place, with rank
+    IDs computed from `mgpuGetRank()` (delinearized into the rank's N-D
+    iteration space) and rank sizes substituted from the static size operands.
+
+    The pass also inserts `mgpuSymmetricHeapInit(heap_size)` at the entry of
+    the enclosing `func.func` (default 256 MB; configurable via the
+    `heap-size` option) and `mgpuSymmetricHeapDestroy()` before each
+    `func.return` in that function.
+
+    This replaces `air-rank-to-launch` for the GPU pipeline. Unlike
+    `air-rank-to-launch` (which serializes ranks via `scf.for`), this pass
+    assumes each process executes the whole rank body once and runtime
+    coordinates across processes via env vars (RANK / WORLD_SIZE / LOCAL_RANK)
+    and the symmetric-heap fabric.
+  }];
+  let constructor = "xilinx::air::createAIRRankToMgpuPass()";
+  let dependentDialects = [
+    "func::FuncDialect",
+    "arith::ArithDialect"
+  ];
+  let options = [
+    Option<"heapSize", "heap-size", "uint64_t",
+           "/*default=*/268435456",
+           "Symmetric heap size in bytes (default: 256 MB)">
+  ];
+}
+
 #endif // AIR_CONVERSION_GPU_PASSES
diff --git a/mlir/lib/Conversion/AIRRankToMgpuPass.cpp b/mlir/lib/Conversion/AIRRankToMgpuPass.cpp
@@ -0,0 +1,181 @@
+//===- AIRRankToMgpuPass.cpp -----------------------------------*- C++ -*-===//
+//
+// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+//===-----------------------------------------------------------------------===//
+//
+// Lower air.rank to mgpu* runtime calls (multi-GPU process model).
+//
+// Each `air.rank` op is replaced by inlining its body in place, with rank
+// IDs computed from `mgpuGetRank()` (delinearized into the rank's N-D
+// iteration space) and rank sizes substituted from the static size operands.
+//
+// The pass also inserts `mgpuSymmetricHeapInit(heap_size)` at the entry of
+// the enclosing `func.func` and `mgpuSymmetricHeapDestroy()` before each
+// `func.return` in that function.
+//
+//===-----------------------------------------------------------------------===//
+
+#include "air/Conversion/AIRRankToMgpuPass.h"
+#include "air/Conversion/GPUPassDetail.h"
+#include "air/Dialect/AIR/AIRDialect.h"
+
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/IRMapping.h"
+#include "mlir/Pass/Pass.h"
+
+using namespace mlir;
+using namespace xilinx;
+
+namespace {
+
+// Look up the `func.func` symbol `name` in `module`. When it is missing, emit
+// a private declaration of the given `type` at the very top of the module
+// body. An already-present symbol is returned as-is (its type is not
+// compared against `type`). The builder's insertion point is restored before
+// returning.
+static func::FuncOp ensureExternFunc(ModuleOp module, OpBuilder &builder,
+                                     StringRef name, FunctionType type) {
+  func::FuncOp decl = module.lookupSymbol<func::FuncOp>(name);
+  if (!decl) {
+    // Scope the insertion-point change to the creation of the declaration.
+    OpBuilder::InsertionGuard restoreInsertionPoint(builder);
+    builder.setInsertionPointToStart(module.getBody());
+    decl = func::FuncOp::create(builder, module.getLoc(), name, type);
+    decl.setPrivate();
+  }
+  return decl;
+}
+
+// Module pass that replaces every `air.rank` op with an inlined copy of its
+// body, driven by a rank id fetched from `mgpuGetRank()`, and wraps each
+// function that contained a rank op in symmetric-heap init/destroy calls.
+struct AIRRankToMgpuPass
+    : public xilinx::air::impl::AIRRankToMgpuBase<AIRRankToMgpuPass> {
+
+  AIRRankToMgpuPass() = default;
+  // Copies nothing from `pass`; the base class is default-constructed.
+  AIRRankToMgpuPass(const AIRRankToMgpuPass &pass) {}
+
+  void runOnOperation() override {
+    auto topModule = getOperation();
+    OpBuilder b(topModule.getContext());
+    auto i32 = b.getI32Type();
+    auto i64 = b.getI64Type();
+
+    // Gather every air.rank op together with the set of functions that
+    // contain at least one of them.
+    SmallVector<air::RankOp> worklist;
+    SetVector<func::FuncOp> hostFuncs;
+    topModule.walk([&](air::RankOp rank) {
+      worklist.push_back(rank);
+      if (auto host = rank->getParentOfType<func::FuncOp>())
+        hostFuncs.insert(host);
+    });
+
+    // Nothing to lower: do not touch the module (no extern decls either).
+    if (worklist.empty())
+      return;
+
+    // Runtime ABI declarations. Each one is inserted at the top of the
+    // module, so the call order below determines their final order.
+    auto heapInitFn = ensureExternFunc(topModule, b, "mgpuSymmetricHeapInit",
+                                       b.getFunctionType({i64}, {}));
+    auto heapDestroyFn =
+        ensureExternFunc(topModule, b, "mgpuSymmetricHeapDestroy",
+                         b.getFunctionType({}, {}));
+    auto rankQueryFn = ensureExternFunc(topModule, b, "mgpuGetRank",
+                                        b.getFunctionType({}, {i32}));
+
+    for (func::FuncOp host : hostFuncs)
+      bracketWithHeapLifecycle(host, b, heapInitFn, heapDestroyFn);
+
+    for (air::RankOp rank : worklist)
+      inlineRank(rank, b, rankQueryFn);
+  }
+
+private:
+  // Emit `initFn(heapSize)` at the start of `fn`'s entry block and a call to
+  // `destroyFn` immediately before every `func.return` found inside `fn`.
+  // Functions without a body are left alone.
+  void bracketWithHeapLifecycle(func::FuncOp fn, OpBuilder &b,
+                                func::FuncOp initFn, func::FuncOp destroyFn) {
+    if (fn.empty())
+      return;
+
+    Location fnLoc = fn.getLoc();
+    b.setInsertionPointToStart(&fn.front());
+    Value heapBytes = arith::ConstantOp::create(
+        b, fnLoc, b.getI64Type(),
+        b.getI64IntegerAttr(static_cast<int64_t>(heapSize)));
+    func::CallOp::create(b, fnLoc, initFn, ValueRange{heapBytes});
+
+    // Snapshot the returns first, then insert in front of each of them.
+    SmallVector<func::ReturnOp> exits;
+    fn.walk([&](func::ReturnOp ret) { exits.push_back(ret); });
+    for (func::ReturnOp ret : exits) {
+      b.setInsertionPoint(ret);
+      func::CallOp::create(b, ret.getLoc(), destroyFn, ValueRange{});
+    }
+  }
+
+  // Replace `rankOp` with a clone of its body placed right before it. The
+  // rank ids come from delinearizing the value returned by `getRankFn`.
+  void inlineRank(air::RankOp rankOp, OpBuilder &b, func::FuncOp getRankFn) {
+    b.setInsertionPoint(rankOp);
+    Location loc = rankOp.getLoc();
+
+    // Incoming async dependencies become a result-less (blocking) wait_all.
+    auto deps = rankOp.getAsyncDependencies();
+    if (!deps.empty())
+      air::WaitAllOp::create(b, loc, Type{}, deps);
+
+    // mgpuGetRank() : i32  --extsi-->  i64  --index_cast-->  index
+    auto rankCall = func::CallOp::create(b, loc, getRankFn, ValueRange{});
+    Value rank32 = rankCall.getResult(0);
+    Value rank64 = arith::ExtSIOp::create(b, loc, b.getI64Type(), rank32);
+    Value linearId =
+        arith::IndexCastOp::create(b, loc, b.getIndexType(), rank64);
+
+    // Peel one dimension at a time off the linear id. Every dimension but the
+    // last takes `rest % size` and passes on `rest / size`; the last
+    // dimension takes whatever is left, without a modulo:
+    //   id[0]   = flat % s0
+    //   id[1]   = (flat / s0) % s1
+    //   ...
+    //   id[n-1] = flat / (s0 * s1 * ... * sn-2)
+    auto extents = rankOp.getSizeOperands();
+    unsigned rankDims = rankOp.getNumDims();
+    SmallVector<Value> coords;
+    coords.reserve(rankDims);
+    Value rest = linearId;
+    for (unsigned dim = 0; dim + 1 < rankDims; ++dim) {
+      Value coord = arith::RemSIOp::create(b, loc, rest, extents[dim]);
+      coords.push_back(coord);
+      rest = arith::DivSIOp::create(b, loc, rest, extents[dim]);
+    }
+    if (rankDims != 0)
+      coords.push_back(rest);
+
+    // Region block arguments -> values visible at the insertion point.
+    IRMapping mapping;
+    for (unsigned dim = 0; dim < rankDims; ++dim) {
+      mapping.map(rankOp.getIds()[dim], coords[dim]);
+      mapping.map(rankOp.getSize()[dim], extents[dim]);
+    }
+    for (unsigned arg = 0, numArgs = rankOp.getNumKernelOperands();
+         arg < numArgs; ++arg)
+      mapping.map(rankOp.getKernelArgument(arg), rankOp.getKernelOperand(arg));
+
+    // Copy everything in the body except its terminator.
+    for (Operation &nested : rankOp.getBody().front().without_terminator())
+      b.clone(nested, mapping);
+
+    // Users of the rank's async token are redirected to a fresh wait_all
+    // that has no dependencies.
+    Value token = rankOp.getAsyncToken();
+    if (token) {
+      auto done = air::WaitAllOp::create(
+          b, loc, air::AsyncTokenType::get(b.getContext()), ValueRange{});
+      token.replaceAllUsesWith(done.getAsyncToken());
+    }
+
+    rankOp.erase();
+  }
+};
+
+} // namespace
+
+namespace xilinx::air {
+
+// Public factory: hands back a default-constructed AIRRankToMgpuPass.
+std::unique_ptr<mlir::Pass> createAIRRankToMgpuPass() {
+  return std::make_unique<AIRRankToMgpuPass>();
+}
+
+} // namespace xilinx::air
diff --git a/mlir/lib/Conversion/CMakeLists.txt b/mlir/lib/Conversion/CMakeLists.txt
@@ -57,6 +57,7 @@ if(AIR_ENABLE_GPU)
     AIRToROCDLPass.cpp
     AIRTranslateToLLVMPass.cpp
     GPUKernelOutlinePass.cpp
+    AIRRankToMgpuPass.cpp
   )
   list(APPEND CONVERSION_LINK_LIBS
     MLIRGPUDialect
diff --git a/mlir/lib/Conversion/Passes.cpp b/mlir/lib/Conversion/Passes.cpp
@@ -9,6 +9,7 @@
 #include "air/Conversion/Passes.h"
 
 #if AIR_ENABLE_GPU
+#include "air/Conversion/AIRRankToMgpuPass.h"
 #include "air/Conversion/AIRToROCDLPass.h"
 #include "air/Conversion/AIRTranslateToLLVMPass.h"
 #include "air/Conversion/GPUKernelOutlinePass.h"
diff --git a/mlir/test/Conversion/AIRRankToMgpu/rank_to_mgpu.mlir b/mlir/test/Conversion/AIRRankToMgpu/rank_to_mgpu.mlir
diff --git a/test/gpu/symmetric_heap_dma/air_sym_with_rank.mlir b/test/gpu/symmetric_heap_dma/air_sym_with_rank.mlir
diff --git a/test/gpu/symmetric_heap_dma/run.sh b/test/gpu/symmetric_heap_dma/run.sh

Original file line number	Diff line number	Diff line change
`@@ -57,6 +57,7 @@ if(AIR_ENABLE_GPU)`
`57`	`57`	`AIRToROCDLPass.cpp`
`58`	`58`	`AIRTranslateToLLVMPass.cpp`
`59`	`59`	`GPUKernelOutlinePass.cpp`
	`60`	`+ AIRRankToMgpuPass.cpp`
`60`	`61`	`)`
`61`	`62`	`list(APPEND CONVERSION_LINK_LIBS`
`62`	`63`	`MLIRGPUDialect`