[multi-gpu] Phase 5: air-cross-rank-dma-to-mgpu lowering pass

erwei-xilinx · claude · erwei-xilinx · commit dae66d7306cd · 2026-05-06T04:24:35.000Z
New conversion pass that lowers `air.dma_memcpy_nd` ops carrying a `src_rank` or `dst_rank` integer attribute (added in Phase 1) to host-side `mgpuMemcpy` calls with peer-VA addressing through `mgpuGetHeapBases()`.

The peer pointer is computed at runtime as:

    peer_ptr = bases[peer_rank] + (local_ptr - bases[my_rank])

where `local_ptr` is extracted from the local-side memref via `memref.extract_aligned_pointer_as_index` and `local_base = bases[my_rank]` gives this rank's symmetric heap base.

## Restrictions (this initial version)

- Both `src` and `dst` memrefs must be in `memory_space=0` (L3/global)
- The op must be at host scope (not inside a `gpu.launch` or `gpu.func`)
- "Entire memref" form only — no explicit `[offsets][sizes][strides]`
- Only one of `src_rank` / `dst_rank` may be set per op

These restrictions match the hand-written reference's Phase 2 pattern. They can be relaxed in follow-up work.

## Files

- `mlir/include/air/Conversion/AIRCrossRankDmaToMgpuPass.h` — header
- `mlir/include/air/Conversion/GPUPasses.td` — `air-cross-rank-dma-to-mgpu` def
- `mlir/include/air/Conversion/GPUPassDetail.h` — `GEN_PASS_DEF_AIRCROSSRANKDMATOMGPU`
- `mlir/lib/Conversion/AIRCrossRankDmaToMgpuPass.cpp` — implementation
- `mlir/lib/Conversion/{CMakeLists.txt,Passes.cpp}` — registration
- `mlir/test/Conversion/AIRCrossRankDmaToMgpu/cross_rank_dma.mlir` — FileCheck
- `test/gpu/symmetric_heap_dma/air_sym_with_dma.mlir` — high-level e2e combining Phase 1 attrs + Phase 3 + Phase 4 + Phase 5 lowering
- `test/gpu/symmetric_heap_dma/run.sh` — adds `INPUT=dma` selector

## Test plan

FileCheck unit tests cover:

- src_rank lowering shape (size, ptr extraction, bases, GEP, ptrtoint, subi, byte-stride GEP, mgpuMemcpy)
- dst_rank lowering (peer pointer becomes dst arg)
- 2D memref byte size
- f64 element type byte size
- Multiple cross-rank DMAs share extern decls
- Pass is a no-op for non-cross-rank DMAs

End-to-end on rad-mi300a-sh5-1 (SHARE_GPU=1, 2 ranks):

- INPUT=handwritten — PASS (Phase 2 baseline)
- INPUT=rank — PASS (Phase 3)
- INPUT=alloc — PASS (Phase 4)
- INPUT=dma — PASS (Phase 5: chains Phase 5 -> Phase 4 -> Phase 3)

Both ranks read rank 0's symmetric src_buf via cross-rank DMA into their own dst_buf; verification reads back 1.0.

Same SHARE_GPU=1 single-physical-GPU caveat as Xilinx#1577 / Xilinx#1578 / Xilinx#1579 — true multi-GPU re-validation is needed before declaring multi-GPU production-ready.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/mlir/include/air/Conversion/AIRCrossRankDmaToMgpuPass.h b/mlir/include/air/Conversion/AIRCrossRankDmaToMgpuPass.h
@@ -0,0 +1,22 @@
+//===- AIRCrossRankDmaToMgpuPass.h ------------------------------*- C++ -*-===//
+//
+// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+//===-----------------------------------------------------------------------===//
+
+#ifndef AIR_CONVERSION_AIR_CROSS_RANK_DMA_TO_MGPU_PASS_H
+#define AIR_CONVERSION_AIR_CROSS_RANK_DMA_TO_MGPU_PASS_H
+
+#include "mlir/Pass/Pass.h"
+#include <memory>
+
+namespace xilinx {
+namespace air {
+
+/// Create an instance of the `air-cross-rank-dma-to-mgpu` pass, which lowers
+/// `air.dma_memcpy_nd` ops carrying a `src_rank` / `dst_rank` attribute to
+/// host-side `mgpuMemcpy` calls.
+std::unique_ptr<mlir::Pass> createAIRCrossRankDmaToMgpuPass();
+
+} // namespace air
+} // namespace xilinx
+
+#endif // AIR_CONVERSION_AIR_CROSS_RANK_DMA_TO_MGPU_PASS_H
diff --git a/mlir/include/air/Conversion/GPUPassDetail.h b/mlir/include/air/Conversion/GPUPassDetail.h
@@ -28,6 +28,7 @@ using namespace mlir;
 #define GEN_PASS_DEF_CONVERTGPUKERNELOUTLINE
 #define GEN_PASS_DEF_AIRRANKTOMGPU
 #define GEN_PASS_DEF_AIRSYMMETRICALLOCTOMGPU
+#define GEN_PASS_DEF_AIRCROSSRANKDMATOMGPU
 #include "air/Conversion/GPUPasses.h.inc"
 
 } // namespace air
diff --git a/mlir/include/air/Conversion/GPUPasses.td b/mlir/include/air/Conversion/GPUPasses.td
@@ -49,6 +49,31 @@ def ConvertGPUKernelOutline : Pass<"air-gpu-outlining", "ModuleOp"> {
   let options = [];
 }
 
+def AIRCrossRankDmaToMgpu : Pass<"air-cross-rank-dma-to-mgpu", "ModuleOp"> {
+  let summary = "Lower air.dma_memcpy_nd with src_rank/dst_rank to mgpuMemcpy "
+                "with peer-VA addressing through mgpuGetHeapBases()";
+  let constructor = "xilinx::air::createAIRCrossRankDmaToMgpuPass()";
+  let description = [{
+    For each `air.dma_memcpy_nd` op carrying a `src_rank` or `dst_rank`
+    integer attribute, emit a host-side `mgpuMemcpy` whose peer-side pointer
+    is computed as `mgpuGetHeapBases()[peer] + (local_ptr - local_base)`,
+    where `local_base` is `mgpuGetHeapBases()[mgpuGetRank()]`.
+
+    Restrictions in this initial version:
+      - Both `src` and `dst` memrefs must be in `memory_space=0`.
+      - The op must be at host scope (not inside any `gpu.launch`/`gpu.func`).
+      - "Entire memref" form only: `[]` `[]` `[]` for both sides — no
+        custom offsets / sizes / strides.
+      - Only one of `src_rank` / `dst_rank` may be set on a given op.
+      - The memref on the side carrying the rank attribute must have a static
+        shape and a byte-aligned element type; its byte size is the copy size.
+
+    Ops that violate a restriction are reported with an error and the pass
+    fails.
+
+    Run this pass *before* `air-symmetric-alloc-to-mgpu` so that pointer
+    extraction (`memref.extract_aligned_pointer_as_index`) sees plain
+    memrefs rather than already-cast LLVM struct values.
+  }];
+  let dependentDialects = [
+    "func::FuncDialect", "arith::ArithDialect", "memref::MemRefDialect",
+    "LLVM::LLVMDialect"
+  ];
+}
+
 def AIRSymmetricAllocToMgpu : Pass<"air-symmetric-alloc-to-mgpu", "ModuleOp"> {
   let summary = "Lower memref.alloc {air.symmetric} to mgpuSymmetricAlloc and "
                 "memref.dealloc of the result to mgpuSymmetricFree";
diff --git a/mlir/lib/Conversion/AIRCrossRankDmaToMgpuPass.cpp b/mlir/lib/Conversion/AIRCrossRankDmaToMgpuPass.cpp
@@ -0,0 +1,247 @@
+//===- AIRCrossRankDmaToMgpuPass.cpp ---------------------------*- C++ -*-===//
+//
+// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+//===-----------------------------------------------------------------------===//
+//
+// Lower air.dma_memcpy_nd ops carrying a `src_rank` or `dst_rank` integer
+// attribute to host-side mgpuMemcpy calls with peer-VA addressing through
+// mgpuGetHeapBases().
+//
+// Pattern emitted (for src_rank = R):
+//   %size       = arith.constant <bytes> : i64
+//   %nullptr    = llvm.mlir.zero : !llvm.ptr
+//   %dst_ptr    = (extract aligned ptr from %dst memref)
+//   %src_ptr    = (extract aligned ptr from %src memref)
+//   %my_rank    = call @mgpuGetRank() : () -> i32
+//   %bases      = call @mgpuGetHeapBases() : () -> !llvm.ptr
+//   %my_base_at = llvm.getelementptr %bases[%my_rank] : ... -> !llvm.ptr, !llvm.ptr
+//   %my_base    = llvm.load %my_base_at : !llvm.ptr -> !llvm.ptr
+//   %src_int    = llvm.ptrtoint %src_ptr  : !llvm.ptr to i64
+//   %my_base_int = llvm.ptrtoint %my_base : !llvm.ptr to i64
+//   %offset     = arith.subi %src_int, %my_base_int : i64
+//   %peer_base_at = llvm.getelementptr %bases[<R>] : ... -> !llvm.ptr, !llvm.ptr
+//   %peer_base    = llvm.load %peer_base_at : !llvm.ptr -> !llvm.ptr
+//   %peer_src     = llvm.getelementptr %peer_base[%offset] : ... -> !llvm.ptr, i8
+//   call @mgpuMemcpy(%dst_ptr, %peer_src, %size, %nullptr)
+//
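+// Initial restrictions:
+//   - Both memrefs must have memory_space=0 (L3/global).
+//   - Op must be at host scope (not inside a gpu.launch / gpu.func).
+//   - "Entire memref" form only: empty offsets/sizes/strides on both sides.
+//   - Only one of `src_rank` / `dst_rank` may be set on a given op.
+//   - The memref on the side carrying the rank attribute must have a static
+//     shape and a byte-aligned element type (its byte size is the copy size).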
+//
+//===-----------------------------------------------------------------------===//
+
+#include "air/Conversion/AIRCrossRankDmaToMgpuPass.h"
+#include "air/Conversion/GPUPassDetail.h"
+#include "air/Dialect/AIR/AIRDialect.h"
+
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/Pass/Pass.h"
+
+using namespace mlir;
+using namespace xilinx;
+
+namespace {
+
+// Return the module-level `func.func` named `name`. When the module has no
+// such symbol yet, a private declaration with signature `type` is created at
+// the start of the module body; the builder's insertion point is restored
+// before returning. An already-present function is returned as-is (its
+// signature is not compared against `type`).
+static func::FuncOp ensureExternFunc(ModuleOp module, OpBuilder &builder,
+                                     StringRef name, FunctionType type) {
+  func::FuncOp decl = module.lookupSymbol<func::FuncOp>(name);
+  if (!decl) {
+    OpBuilder::InsertionGuard restoreInsertionPoint(builder);
+    builder.setInsertionPointToStart(module.getBody());
+    decl = func::FuncOp::create(builder, module.getLoc(), name, type);
+    decl.setPrivate();
+  }
+  return decl;
+}
+
+// Compute the total byte size of `ty` (element count times element byte
+// width) and materialise it as an i64 `arith.constant` at the builder's
+// current insertion point.
+//
+// Returns a null Value, without creating any op, when the size cannot be
+// derived statically:
+//   - the shape is not fully static,
+//   - the element type is neither an integer nor a float (e.g. index,
+//     complex, vector), or
+//   - the element bit width is zero or not a multiple of 8.
+static Value computeMemrefByteSize(OpBuilder &b, Location loc, MemRefType ty) {
+  if (!ty.hasStaticShape())
+    return nullptr;
+  // getIntOrFloatBitWidth() asserts on any other kind of element type, so
+  // reject those here and let the caller emit a proper diagnostic instead.
+  Type eltTy = ty.getElementType();
+  if (!eltTy.isIntOrFloat())
+    return nullptr;
+  unsigned eltBits = eltTy.getIntOrFloatBitWidth();
+  if (eltBits == 0 || (eltBits % 8) != 0)
+    return nullptr;
+  int64_t numElts = 1;
+  for (int64_t d : ty.getShape())
+    numElts *= d;
+  int64_t totalBytes = numElts * (eltBits / 8);
+  return arith::ConstantOp::create(b, loc, b.getI64Type(),
+                                   b.getI64IntegerAttr(totalBytes));
+}
+
+// Materialise the aligned pointer of `memref` as an `!llvm.ptr` value by
+// chaining memref.extract_aligned_pointer_as_index -> arith.index_cast (to
+// i64) -> llvm.inttoptr at the builder's current insertion point.
+static Value extractAlignedPtr(OpBuilder &b, Location loc, Value memref) {
+  auto llvmPtrTy = LLVM::LLVMPointerType::get(b.getContext());
+  Value addrAsIndex =
+      memref::ExtractAlignedPointerAsIndexOp::create(b, loc, memref);
+  Value addrAsI64 =
+      arith::IndexCastOp::create(b, loc, b.getI64Type(), addrAsIndex);
+  return LLVM::IntToPtrOp::create(b, loc, llvmPtrTy, addrAsI64);
+}
+
+// Module pass that replaces every cross-rank `air.dma_memcpy_nd` (an op for
+// which `hasCrossRank()` is true, i.e. one carrying `src_rank` or `dst_rank`)
+// with a host-side call to `mgpuMemcpy`.
+//
+// The side carrying the rank attribute is the "peer" side. Its address on the
+// peer is derived from this rank's pointer to the same buffer:
+//   peer_ptr = bases[peer_rank] + (ptr - bases[my_rank])
+// where `bases = mgpuGetHeapBases()` and `my_rank = mgpuGetRank()`. The other
+// side is passed to `mgpuMemcpy` as its plain aligned pointer.
+//
+// Any op outside the supported subset (see checkSupported) is reported with an
+// op error and the pass fails.
+struct AIRCrossRankDmaToMgpuPass
+    : public xilinx::air::impl::AIRCrossRankDmaToMgpuBase<
+          AIRCrossRankDmaToMgpuPass> {
+
+  AIRCrossRankDmaToMgpuPass() = default;
+  AIRCrossRankDmaToMgpuPass(const AIRCrossRankDmaToMgpuPass &) {}
+
+  void runOnOperation() override {
+    auto module = getOperation();
+    OpBuilder builder(module.getContext());
+    auto i8Ty = builder.getI8Type();
+    auto i32Ty = builder.getI32Type();
+    auto i64Ty = builder.getI64Type();
+    auto ptrTy = LLVM::LLVMPointerType::get(module.getContext());
+
+    // Collect first, rewrite afterwards: the rewrite erases the ops, which
+    // must not happen while the walk is still visiting them.
+    SmallVector<air::DmaMemcpyNdOp> crossRankDmas;
+    module.walk([&](air::DmaMemcpyNdOp op) {
+      if (op.hasCrossRank())
+        crossRankDmas.push_back(op);
+    });
+    if (crossRankDmas.empty())
+      return;
+
+    // Declare the runtime ABI functions we may need.
+    auto getRankFn = ensureExternFunc(module, builder, "mgpuGetRank",
+                                       builder.getFunctionType({}, {i32Ty}));
+    auto getBasesFn =
+        ensureExternFunc(module, builder, "mgpuGetHeapBases",
+                          builder.getFunctionType({}, {ptrTy}));
+    auto memcpyFn = ensureExternFunc(
+        module, builder, "mgpuMemcpy",
+        builder.getFunctionType({ptrTy, ptrTy, i64Ty, ptrTy}, {}));
+
+    for (air::DmaMemcpyNdOp dma : crossRankDmas) {
+      if (failed(checkSupported(dma))) {
+        signalPassFailure();
+        return;
+      }
+      Location loc = dma.getLoc();
+
+      // Exactly one side carries a rank attribute at this point.
+      bool srcIsPeer = dma.getSrcRank().has_value();
+      bool dstIsPeer = dma.getDstRank().has_value();
+      int64_t peerRank =
+          srcIsPeer ? *dma.getSrcRank() : *dma.getDstRank();
+      auto peerSideType =
+          cast<MemRefType>((srcIsPeer ? dma.getSrcMemref() : dma.getDstMemref())
+                               .getType());
+      Value peerMemref = srcIsPeer ? dma.getSrcMemref() : dma.getDstMemref();
+      Value localMemref =
+          srcIsPeer ? dma.getDstMemref() : dma.getSrcMemref();
+
+      builder.setInsertionPoint(dma);
+      Value sizeBytes = computeMemrefByteSize(builder, loc, peerSideType);
+      if (!sizeBytes) {
+        dma.emitOpError("cross-rank DMA requires static memref shape with "
+                        "byte-aligned element type");
+        signalPassFailure();
+        return;
+      }
+      // Fourth mgpuMemcpy argument is always passed as null (presumably the
+      // stream handle — confirm against the runtime wrapper).
+      Value nullPtr = LLVM::ZeroOp::create(builder, loc, ptrTy);
+
+      // This rank's pointer to the peer-side buffer, and the plain pointer of
+      // the other side.
+      Value peerLocalPtr = extractAlignedPtr(builder, loc, peerMemref);
+      Value localPtr = extractAlignedPtr(builder, loc, localMemref);
+
+      // bases = mgpuGetHeapBases()
+      Value bases = func::CallOp::create(builder, loc, getBasesFn, ValueRange{})
+                       .getResult(0);
+
+      // my_rank = mgpuGetRank() (i32 -> i64)
+      Value myRankI32 =
+          func::CallOp::create(builder, loc, getRankFn, ValueRange{})
+              .getResult(0);
+      Value myRankI64 = arith::ExtSIOp::create(builder, loc, i64Ty, myRankI32);
+
+      // my_base = bases[my_rank]
+      Value myBaseAddr = LLVM::GEPOp::create(builder, loc, ptrTy, ptrTy, bases,
+                                              ArrayRef<Value>{myRankI64});
+      Value myBase = LLVM::LoadOp::create(builder, loc, ptrTy, myBaseAddr);
+
+      // peer_base = bases[<peerRank>]
+      Value peerRankIdx = LLVM::ConstantOp::create(
+          builder, loc, i64Ty, builder.getI64IntegerAttr(peerRank));
+      Value peerBaseAddr = LLVM::GEPOp::create(
+          builder, loc, ptrTy, ptrTy, bases, ArrayRef<Value>{peerRankIdx});
+      Value peerBase = LLVM::LoadOp::create(builder, loc, ptrTy, peerBaseAddr);
+
+      // offset = peerLocalPtr (as i64) - my_base (as i64)
+      Value peerLocalInt =
+          LLVM::PtrToIntOp::create(builder, loc, i64Ty, peerLocalPtr);
+      Value myBaseInt = LLVM::PtrToIntOp::create(builder, loc, i64Ty, myBase);
+      Value offset =
+          arith::SubIOp::create(builder, loc, peerLocalInt, myBaseInt);
+
+      // peer_ptr = peer_base + offset (byte-stride GEP)
+      Value peerPtr = LLVM::GEPOp::create(builder, loc, ptrTy, i8Ty, peerBase,
+                                           ArrayRef<Value>{offset});
+
+      // mgpuMemcpy(dst, src, size, nullptr) — substitute peerPtr on the
+      // peer side.
+      Value srcArg = srcIsPeer ? peerPtr : localPtr;
+      Value dstArg = dstIsPeer ? peerPtr : localPtr;
+      func::CallOp::create(builder, loc, memcpyFn,
+                            ValueRange{dstArg, srcArg, sizeBytes, nullPtr});
+
+      // If this DMA returned an async token, replace it with a wait_all that
+      // forwards the DMA's own async dependencies, so that users of the token
+      // stay ordered after everything the DMA was ordered after.
+      if (dma.getAsyncToken()) {
+        Value tok = air::WaitAllOp::create(
+                        builder, loc,
+                        air::AsyncTokenType::get(builder.getContext()),
+                        dma.getAsyncDependencies())
+                        .getAsyncToken();
+        dma.getAsyncToken().replaceAllUsesWith(tok);
+      }
+      dma.erase();
+    }
+  }
+
+private:
+  // Check `dma` against the subset this pass can lower. On the first violated
+  // restriction an op error is emitted and failure is returned; no IR is
+  // modified.
+  static LogicalResult checkSupported(air::DmaMemcpyNdOp dma) {
+    if (dma->getParentOfType<gpu::LaunchOp>() ||
+        dma->getParentOfType<gpu::GPUFuncOp>())
+      return dma.emitOpError(
+          "cross-rank DMA inside a GPU kernel is not yet supported");
+
+    if (!dma.getSrcOffsets().empty() || !dma.getSrcSizes().empty() ||
+        !dma.getSrcStrides().empty() || !dma.getDstOffsets().empty() ||
+        !dma.getDstSizes().empty() || !dma.getDstStrides().empty())
+      return dma.emitOpError(
+          "cross-rank DMA with explicit offsets/sizes/strides "
+          "is not yet supported");
+
+    auto srcType = cast<MemRefType>(dma.getSrcMemref().getType());
+    auto dstType = cast<MemRefType>(dma.getDstMemref().getType());
+    if (srcType.getMemorySpaceAsInt() != 0 ||
+        dstType.getMemorySpaceAsInt() != 0)
+      return dma.emitOpError(
+          "cross-rank DMA requires both memrefs in memory_space=0");
+
+    // Only one side may name a peer rank for now.
+    if (dma.getSrcRank().has_value() && dma.getDstRank().has_value())
+      return dma.emitOpError(
+          "cross-rank DMA with both src_rank and dst_rank set is not yet "
+          "supported");
+
+    // The lowering is a single flat memcpy starting at each aligned pointer,
+    // which is only correct for contiguous, zero-offset buffers.
+    if (!srcType.getLayout().isIdentity() || !dstType.getLayout().isIdentity())
+      return dma.emitOpError(
+          "cross-rank DMA requires identity-layout memrefs");
+
+    // The copy size is taken from one side only, so both sides must describe
+    // the same amount of data or the memcpy would overrun the smaller buffer.
+    bool sameEltType = srcType.getElementType() == dstType.getElementType();
+    bool sameEltCount =
+        !srcType.hasStaticShape() || !dstType.hasStaticShape() ||
+        srcType.getNumElements() == dstType.getNumElements();
+    if (!sameEltType || !sameEltCount)
+      return dma.emitOpError(
+          "cross-rank DMA requires src and dst memrefs with matching element "
+          "type and element count");
+
+    return success();
+  }
+};
+
+} // namespace
+
+namespace xilinx {
+namespace air {
+
+/// Factory for the cross-rank DMA lowering pass: returns a newly allocated,
+/// default-constructed AIRCrossRankDmaToMgpuPass owned by the caller.
+std::unique_ptr<mlir::Pass> createAIRCrossRankDmaToMgpuPass() {
+  return std::make_unique<AIRCrossRankDmaToMgpuPass>();
+}
+
+} // namespace air
+} // namespace xilinx
diff --git a/mlir/lib/Conversion/CMakeLists.txt b/mlir/lib/Conversion/CMakeLists.txt
@@ -59,6 +59,7 @@ if(AIR_ENABLE_GPU)
     GPUKernelOutlinePass.cpp
     AIRRankToMgpuPass.cpp
     AIRSymmetricAllocToMgpuPass.cpp
+    AIRCrossRankDmaToMgpuPass.cpp
   )
   list(APPEND CONVERSION_LINK_LIBS
     MLIRGPUDialect
diff --git a/mlir/lib/Conversion/Passes.cpp b/mlir/lib/Conversion/Passes.cpp
@@ -9,6 +9,7 @@
 #include "air/Conversion/Passes.h"
 
 #if AIR_ENABLE_GPU
+#include "air/Conversion/AIRCrossRankDmaToMgpuPass.h"
 #include "air/Conversion/AIRRankToMgpuPass.h"
 #include "air/Conversion/AIRSymmetricAllocToMgpuPass.h"
 #include "air/Conversion/AIRToROCDLPass.h"
diff --git a/mlir/test/Conversion/AIRCrossRankDmaToMgpu/cross_rank_dma.mlir b/mlir/test/Conversion/AIRCrossRankDmaToMgpu/cross_rank_dma.mlir
diff --git a/test/gpu/symmetric_heap_dma/air_sym_with_dma.mlir b/test/gpu/symmetric_heap_dma/air_sym_with_dma.mlir
diff --git a/test/gpu/symmetric_heap_dma/run.sh b/test/gpu/symmetric_heap_dma/run.sh

Original file line number	Diff line number	Diff line change
`@@ -59,6 +59,7 @@ if(AIR_ENABLE_GPU)`
`59`	`59`	`GPUKernelOutlinePass.cpp`
`60`	`60`	`AIRRankToMgpuPass.cpp`
`61`	`61`	`AIRSymmetricAllocToMgpuPass.cpp`
	`62`	`+ AIRCrossRankDmaToMgpuPass.cpp`
`62`	`63`	`)`
`63`	`64`	`list(APPEND CONVERSION_LINK_LIBS`
`64`	`65`	`MLIRGPUDialect`