Xilinx · erwei-xilinx · May 3, 2026 · May 5, 2026 · May 5, 2026 · May 6, 2026
@@ -0,0 +1,22 @@
+//===- AIRCrossRankDmaToMgpuPass.h ------------------------------*- C++ -*-===//
+//
+// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+//===-----------------------------------------------------------------------===//
+
+#ifndef AIR_CONVERSION_AIR_CROSS_RANK_DMA_TO_MGPU_PASS_H
+#define AIR_CONVERSION_AIR_CROSS_RANK_DMA_TO_MGPU_PASS_H
+
+#include "mlir/Pass/Pass.h"
+#include <memory>
+
+namespace xilinx::air {
+
+/// Returns a newly constructed instance of the pass that the TableGen
+/// definitions register under the name `air-cross-rank-dma-to-mgpu`.
+std::unique_ptr<mlir::Pass> createAIRCrossRankDmaToMgpuPass();
+
+} // namespace xilinx::air
+
+#endif // AIR_CONVERSION_AIR_CROSS_RANK_DMA_TO_MGPU_PASS_H
@@ -0,0 +1,22 @@
+//===- AIRGpuChannelToMgpuPass.h --------------------------------*- C++ -*-===//
+//
+// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+//===-----------------------------------------------------------------------===//
+
+#ifndef AIR_CONVERSION_AIR_GPU_CHANNEL_TO_MGPU_PASS_H
+#define AIR_CONVERSION_AIR_GPU_CHANNEL_TO_MGPU_PASS_H
+
+#include "mlir/Pass/Pass.h"
+#include <memory>
+
+namespace xilinx::air {
+
+/// Returns a newly constructed instance of the pass that the TableGen
+/// definitions register under the name `air-gpu-channel-to-mgpu`.
+std::unique_ptr<mlir::Pass> createAIRGpuChannelToMgpuPass();
+
+} // namespace xilinx::air
+
+#endif // AIR_CONVERSION_AIR_GPU_CHANNEL_TO_MGPU_PASS_H
@@ -0,0 +1,22 @@
+//===- AIRRankToMgpuPass.h ---------------------------------------*- C++ -*-===//
+//
+// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+//===-----------------------------------------------------------------------===//
+
+#ifndef AIR_CONVERSION_AIR_RANK_TO_MGPU_PASS_H
+#define AIR_CONVERSION_AIR_RANK_TO_MGPU_PASS_H
+
+#include "mlir/Pass/Pass.h"
+#include <memory>
+
+namespace xilinx::air {
+
+/// Returns a newly constructed instance of the pass that the TableGen
+/// definitions register under the name `air-rank-to-mgpu`.
+std::unique_ptr<mlir::Pass> createAIRRankToMgpuPass();
+
+} // namespace xilinx::air
+
+#endif // AIR_CONVERSION_AIR_RANK_TO_MGPU_PASS_H
@@ -0,0 +1,22 @@
+//===- AIRSymmetricAllocToMgpuPass.h ----------------------------*- C++ -*-===//
+//
+// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+//===-----------------------------------------------------------------------===//
+
+#ifndef AIR_CONVERSION_AIR_SYMMETRIC_ALLOC_TO_MGPU_PASS_H
+#define AIR_CONVERSION_AIR_SYMMETRIC_ALLOC_TO_MGPU_PASS_H
+
+#include "mlir/Pass/Pass.h"
+#include <memory>
+
+namespace xilinx::air {
+
+/// Returns a newly constructed instance of the pass that the TableGen
+/// definitions register under the name `air-symmetric-alloc-to-mgpu`.
+std::unique_ptr<mlir::Pass> createAIRSymmetricAllocToMgpuPass();
+
+} // namespace xilinx::air
+
+#endif // AIR_CONVERSION_AIR_SYMMETRIC_ALLOC_TO_MGPU_PASS_H
@@ -0,0 +1,22 @@
+//===- AIRTranslateToLLVMPass.h --------------------------------*- C++ -*-===//
+//
+// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+//===-----------------------------------------------------------------------===//
+
+#ifndef AIR_CONVERSION_AIR_TRANSLATE_TO_LLVM_PASS_H
+#define AIR_CONVERSION_AIR_TRANSLATE_TO_LLVM_PASS_H
+
+#include "mlir/Pass/Pass.h"
+#include <memory>
+
+namespace xilinx::air {
+
+/// Returns a newly constructed instance of the pass that the TableGen
+/// definitions register under the name `air-translate-to-llvm`.
+std::unique_ptr<mlir::Pass> createAIRTranslateToLLVMPass();
+
+} // namespace xilinx::air
+
+#endif // AIR_CONVERSION_AIR_TRANSLATE_TO_LLVM_PASS_H
@@ -23,8 +23,13 @@ namespace air {
 using namespace mlir;
 
+// Selector macros for the TableGen-generated "air/Conversion/GPUPasses.h.inc"
+// included right after this list. GEN_PASS_DECL requests the declarations;
+// each GEN_PASS_DEF_<NAME> requests the definition-side boilerplate for the
+// pass whose TableGen `def` name, upper-cased, is <NAME>.
 #define GEN_PASS_DECL
+#define GEN_PASS_DEF_AIRTRANSLATETOLLVM
 #define GEN_PASS_DEF_CONVERTAIRTOROCDL
 #define GEN_PASS_DEF_CONVERTGPUKERNELOUTLINE
+#define GEN_PASS_DEF_AIRRANKTOMGPU
+#define GEN_PASS_DEF_AIRSYMMETRICALLOCTOMGPU
+#define GEN_PASS_DEF_AIRCROSSRANKDMATOMGPU
+#define GEN_PASS_DEF_AIRGPUCHANNELTOMGPU
 #include "air/Conversion/GPUPasses.h.inc"
 
 } // namespace air

@@ -21,6 +21,23 @@ def ConvertAIRToROCDL : Pass<"air-to-rocdl", "ModuleOp"> {
   let options = [];
 }
 
+def AIRTranslateToLLVM : Pass<"air-translate-to-llvm", "ModuleOp"> {
+  // Adjacent string literals are concatenated; the summary text is unchanged.
+  let summary = "Lower air.translate to memref.reinterpret_cast + "
+                "LLVM-dialect address arithmetic";
+  let constructor = "xilinx::air::createAIRTranslateToLLVMPass()";
+  // Dialects declared as dependencies of this pass.
+  let dependentDialects = ["mlir::arith::ArithDialect",
+                           "mlir::memref::MemRefDialect",
+                           "mlir::LLVM::LLVMDialect"];
+  let description = [{
+    Expands each `air.translate` op into the pointer-rebase computation:
+    `bases[to_rank] - bases[from_rank]`, converted from bytes to elements
+    of the source memref's element type, then applied as a new offset
+    via `memref.reinterpret_cast`. The expansion is pure arithmetic; it
+    works identically on host functions and inside `gpu.func`.
+  }];
+}
+
 def ConvertGPUKernelOutline : Pass<"air-gpu-outlining", "ModuleOp"> {
   let summary = "Outline GPU Kernel Func from GPU Launch";
   let constructor = "xilinx::air::createGPUKernelOutlinePass()";
@@ -32,4 +49,107 @@ def ConvertGPUKernelOutline : Pass<"air-gpu-outlining", "ModuleOp"> {
   let options = [];
 }
 
+def AIRGpuChannelToMgpu : Pass<"air-gpu-channel-to-mgpu", "ModuleOp"> {
+  let summary = "Lower air.channel.put/get of channel_type=\"gpu_symmetric_heap\" "
+                "to host-side mgpuMemcpy (peer-VA) + mgpuBarrier";
+  let constructor = "xilinx::air::createAIRGpuChannelToMgpuPass()";
+  let description = [{
+    For each `air.channel @C [...] {channel_type = "gpu_symmetric_heap"}`,
+    pair its single `air.channel.put` and single `air.channel.get`. The put
+    becomes `mgpuBarrier()` (publish: data is already in the symmetric heap
+    via the put's `air.symmetric` source memref). The get becomes
+    `mgpuBarrier()` followed by `mgpuMemcpy(dst, peer_va(put_src), size)`
+    where the peer rank is the get's first index operand and the peer VA is
+    computed via `mgpuGetHeapBases()`.
+
+    Restrictions in this initial version:
+      - One put and one get per channel symbol.
+      - Both put and get at host scope (no `gpu.launch`/`gpu.func`).
+      - put's source memref must be `air.symmetric`-tagged.
+      - get's destination memref must be in `memory_space=0`.
+      - "Entire memref" form only on both sides.
+      - get must take exactly one index operand (the peer rank).
+  }];
+  // Dialect class names are fully qualified so the generated code does not
+  // rely on a `using namespace mlir` at the include site, matching the
+  // `air-translate-to-llvm` definition in this file.
+  let dependentDialects = [
+    "mlir::func::FuncDialect", "mlir::arith::ArithDialect",
+    "mlir::memref::MemRefDialect", "mlir::LLVM::LLVMDialect"
+  ];
+}
+
+def AIRCrossRankDmaToMgpu : Pass<"air-cross-rank-dma-to-mgpu", "ModuleOp"> {
+  let summary = "Lower air.dma_memcpy_nd with src_rank/dst_rank to mgpuMemcpy "
+                "with peer-VA addressing through mgpuGetHeapBases()";
+  let constructor = "xilinx::air::createAIRCrossRankDmaToMgpuPass()";
+  let description = [{
+    For each `air.dma_memcpy_nd` op carrying a `src_rank` or `dst_rank`
+    integer attribute, emit a host-side `mgpuMemcpy` whose peer-side pointer
+    is computed as `mgpuGetHeapBases()[peer] + (local_ptr - local_base)`.
+
+    Restrictions in this initial version:
+      - Both `src` and `dst` memrefs must be in `memory_space=0`.
+      - The op must be at host scope (not inside any `gpu.launch`/`gpu.func`).
+      - "Entire memref" form only: `[]` `[]` `[]` for both sides — no
+        custom offsets / sizes / strides.
+
+    Run this pass *before* `air-symmetric-alloc-to-mgpu` so that pointer
+    extraction (`memref.extract_aligned_pointer_as_index`) sees plain
+    memrefs rather than already-cast LLVM struct values.
+  }];
+  // Dialect class names are fully qualified so the generated code does not
+  // rely on a `using namespace mlir` at the include site, matching the
+  // `air-translate-to-llvm` definition in this file.
+  let dependentDialects = [
+    "mlir::func::FuncDialect", "mlir::arith::ArithDialect",
+    "mlir::memref::MemRefDialect", "mlir::LLVM::LLVMDialect"
+  ];
+}
+
+def AIRSymmetricAllocToMgpu : Pass<"air-symmetric-alloc-to-mgpu", "ModuleOp"> {
+  let summary = "Lower memref.alloc {air.symmetric} to mgpuSymmetricAlloc and "
+                "memref.dealloc of the result to mgpuSymmetricFree";
+  let constructor = "xilinx::air::createAIRSymmetricAllocToMgpuPass()";
+  let description = [{
+    Replaces each `memref.alloc` carrying the unit attribute `air.symmetric`
+    with a call to `mgpuSymmetricAlloc(size_in_bytes, stream)` returning
+    `!llvm.ptr`, then builds an LLVM memref descriptor (struct) wrapping that
+    pointer and projects it back to the original memref type via
+    `builtin.unrealized_conversion_cast` so downstream uses keep working.
+
+    For every `memref.dealloc` whose operand traces back (through a single
+    `unrealized_conversion_cast`) to such a symmetric alloc, the pass emits
+    `mgpuSymmetricFree(ptr, stream)` and erases the dealloc.
+
+    Should run before `convert-to-llvm`. Does nothing if no `air.symmetric`
+    allocations are present.
+  }];
+  // Dialect class names are fully qualified so the generated code does not
+  // rely on a `using namespace mlir` at the include site, matching the
+  // `air-translate-to-llvm` definition in this file.
+  let dependentDialects = [
+    "mlir::func::FuncDialect", "mlir::arith::ArithDialect",
+    "mlir::LLVM::LLVMDialect"
+  ];
+}
+
+def AIRRankToMgpu : Pass<"air-rank-to-mgpu", "ModuleOp"> {
+  let summary = "Lower air.rank to mgpu* runtime calls (multi-GPU process model)";
+  let constructor = "xilinx::air::createAIRRankToMgpuPass()";
+  let description = [{
+    Each `air.rank` op is replaced by inlining its body in place, with rank
+    IDs computed from `mgpuGetRank()` (delinearized into the rank's N-D
+    iteration space) and rank sizes substituted from the static size operands.
+
+    The pass also inserts `mgpuSymmetricHeapInit(heap_size)` at the entry of
+    the enclosing `func.func` (default 256 MB; configurable via the
+    `heap-size` option) and `mgpuSymmetricHeapDestroy()` before each
+    `func.return` in that function.
+
+    This replaces `air-rank-to-launch` for the GPU pipeline. Unlike
+    `air-rank-to-launch` (which serializes ranks via `scf.for`), this pass
+    assumes each process executes the whole rank body once and runtime
+    coordinates across processes via env vars (RANK / WORLD_SIZE / LOCAL_RANK)
+    and the symmetric-heap fabric.
+  }];
+  // 268435456 bytes == 256 * 1024 * 1024, i.e. the 256 MB named in the help text.
+  let options = [
+    Option<"heapSize", "heap-size", "uint64_t", "/*default=*/268435456",
+           "Symmetric heap size in bytes (default: 256 MB)">
+  ];
+  // Dialect class names are fully qualified so the generated code does not
+  // rely on a `using namespace mlir` at the include site, matching the
+  // `air-translate-to-llvm` definition in this file.
+  let dependentDialects = [
+    "mlir::func::FuncDialect", "mlir::arith::ArithDialect"
+  ];
+}
+
 #endif // AIR_CONVERSION_GPU_PASSES
@@ -926,6 +926,43 @@ def air_ExecuteTerminatorOp : air_Op<"execute_terminator", [HasParent<"ExecuteOp
       [{  attr-dict ($results^ `:` type($results))? }];
 }
 
+// Symmetric-heap pointer rebase expressed at the memref level.
+// NOTE(review): `Pure` promises no memory effects, while the documented
+// rebase depends on values stored in `$heap_bases` — confirm that memref is
+// never written once the op can observe it.
+def air_TranslateOp
+    : air_Op<"translate", [Pure, AllTypesMatch<["source", "result"]>]> {
+  let summary = "Re-express a symmetric-heap memref in another rank's address space";
+  let description = [{
+    Produces a memref of the same type as `$source` whose underlying
+    pointer references the corresponding allocation on `$to_rank`. The
+    `$source` memref is assumed to live on `$from_rank`'s symmetric heap.
+    The translation is the pointer rebase
+
+        peer_va = bases[to_rank] + (source_ptr - bases[from_rank])
+
+    where `$heap_bases` is a 1-D memref of `index`-typed pointer values
+    (per-rank symmetric-heap base addresses) obtained from the
+    `mgpuGetHeapBases()` runtime hook. The host typically wraps the raw
+    runtime pointer as a `memref<?xindex>` once and threads it through
+    `gpu.launch_func` as a kernel argument. No data is moved; this op
+    produces a value-level "view" of peer memory.
+
+    Folds to `$source` when `$from_rank` and `$to_rank` are statically
+    equal.
+
+    Both ranks must address the same collective allocation on the
+    symmetric heap (i.e. `$source` must trace back to a
+    `memref.alloc {air.symmetric}`). Using this op outside that contract
+    is undefined.
+  }];
+  // Operands, in order: source memref, originating rank, destination rank,
+  // and the rank-1 memref of per-rank heap base addresses.
+  let arguments = (ins AnyMemRef:$source,
+                       Index:$from_rank,
+                       Index:$to_rank,
+                       MemRefRankOf<[Index], [1]>:$heap_bases);
+  // The result type is tied to `$source` by the AllTypesMatch trait above,
+  // which is why the assembly format does not spell it out.
+  let results = (outs AnyMemRef:$result);
+  let assemblyFormat =
+      [{ $source `,` $from_rank `,` $to_rank `,` $heap_bases
+         attr-dict `:` type($source) `,` type($heap_bases) }];
+  let hasFolder = 1;
+}
+
 // AIR custom op, as a handle for a user-provided AIE kernel
 
 def air_CustomOp : air_Op<"custom", [air_AsyncOpInterface,