diff --git a/mlir/include/air/Conversion/AIRCrossRankDmaToMgpuPass.h b/mlir/include/air/Conversion/AIRCrossRankDmaToMgpuPass.h
new file mode 100644
index 000000000..f3b55cad3
--- /dev/null
+++ b/mlir/include/air/Conversion/AIRCrossRankDmaToMgpuPass.h
@@ -0,0 +1,22 @@
+//===- AIRCrossRankDmaToMgpuPass.h ------------------------------*- C++ -*-===//
+//
+// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+//===-----------------------------------------------------------------------===//
+
+#ifndef AIR_CONVERSION_AIR_CROSS_RANK_DMA_TO_MGPU_PASS_H
+#define AIR_CONVERSION_AIR_CROSS_RANK_DMA_TO_MGPU_PASS_H
+
+#include "mlir/Pass/Pass.h"
+#include <memory>
+
+namespace xilinx {
+namespace air {
+/// Creates the `air-cross-rank-dma-to-mgpu` module pass (see GPUPasses.td).
+std::unique_ptr<mlir::Pass> createAIRCrossRankDmaToMgpuPass();
+
+} // namespace air
+} // namespace xilinx
+
+#endif // AIR_CONVERSION_AIR_CROSS_RANK_DMA_TO_MGPU_PASS_H
diff --git a/mlir/include/air/Conversion/AIRGpuChannelToMgpuPass.h b/mlir/include/air/Conversion/AIRGpuChannelToMgpuPass.h
new file mode 100644
index 000000000..2c9cae589
--- /dev/null
+++ b/mlir/include/air/Conversion/AIRGpuChannelToMgpuPass.h
@@ -0,0 +1,22 @@
+//===- AIRGpuChannelToMgpuPass.h --------------------------------*- C++ -*-===//
+//
+// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+//===-----------------------------------------------------------------------===//
+
+#ifndef AIR_CONVERSION_AIR_GPU_CHANNEL_TO_MGPU_PASS_H
+#define AIR_CONVERSION_AIR_GPU_CHANNEL_TO_MGPU_PASS_H
+
+#include "mlir/Pass/Pass.h"
+#include <memory>
+
+namespace xilinx {
+namespace air {
+/// Creates the `air-gpu-channel-to-mgpu` module pass (see GPUPasses.td).
+std::unique_ptr<mlir::Pass> createAIRGpuChannelToMgpuPass();
+
+} // namespace air
+} // namespace xilinx
+
+#endif // AIR_CONVERSION_AIR_GPU_CHANNEL_TO_MGPU_PASS_H
diff --git a/mlir/include/air/Conversion/AIRRankToMgpuPass.h b/mlir/include/air/Conversion/AIRRankToMgpuPass.h
new file mode 100644
index 000000000..cd19021bd
--- /dev/null
+++ b/mlir/include/air/Conversion/AIRRankToMgpuPass.h
@@ -0,0 +1,22 @@
+//===- AIRRankToMgpuPass.h ---------------------------------------*- C++ -*-===//
+//
+// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+//===-----------------------------------------------------------------------===//
+
+#ifndef AIR_CONVERSION_AIR_RANK_TO_MGPU_PASS_H
+#define AIR_CONVERSION_AIR_RANK_TO_MGPU_PASS_H
+
+#include "mlir/Pass/Pass.h"
+#include <memory>
+
+namespace xilinx {
+namespace air {
+/// Creates the `air-rank-to-mgpu` module pass (see GPUPasses.td).
+std::unique_ptr<mlir::Pass> createAIRRankToMgpuPass();
+
+} // namespace air
+} // namespace xilinx
+
+#endif // AIR_CONVERSION_AIR_RANK_TO_MGPU_PASS_H
diff --git a/mlir/include/air/Conversion/AIRSymmetricAllocToMgpuPass.h b/mlir/include/air/Conversion/AIRSymmetricAllocToMgpuPass.h
new file mode 100644
index 000000000..3168dcfbf
--- /dev/null
+++ b/mlir/include/air/Conversion/AIRSymmetricAllocToMgpuPass.h
@@ -0,0 +1,22 @@
+//===- AIRSymmetricAllocToMgpuPass.h ----------------------------*- C++ -*-===//
+//
+// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+//===-----------------------------------------------------------------------===//
+
+#ifndef AIR_CONVERSION_AIR_SYMMETRIC_ALLOC_TO_MGPU_PASS_H
+#define AIR_CONVERSION_AIR_SYMMETRIC_ALLOC_TO_MGPU_PASS_H
+
+#include "mlir/Pass/Pass.h"
+#include <memory>
+
+namespace xilinx {
+namespace air {
+/// Creates the `air-symmetric-alloc-to-mgpu` module pass (see GPUPasses.td).
+std::unique_ptr<mlir::Pass> createAIRSymmetricAllocToMgpuPass();
+
+} // namespace air
+} // namespace xilinx
+
+#endif // AIR_CONVERSION_AIR_SYMMETRIC_ALLOC_TO_MGPU_PASS_H
diff --git a/mlir/include/air/Conversion/AIRTranslateToLLVMPass.h b/mlir/include/air/Conversion/AIRTranslateToLLVMPass.h
new file mode 100644
index 000000000..b07830787
--- /dev/null
+++ b/mlir/include/air/Conversion/AIRTranslateToLLVMPass.h
@@ -0,0 +1,22 @@
+//===- AIRTranslateToLLVMPass.h --------------------------------*- C++ -*-===//
+//
+// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+//===-----------------------------------------------------------------------===//
+
+#ifndef AIR_CONVERSION_AIR_TRANSLATE_TO_LLVM_PASS_H
+#define AIR_CONVERSION_AIR_TRANSLATE_TO_LLVM_PASS_H
+
+#include "mlir/Pass/Pass.h"
+#include <memory>
+
+namespace xilinx {
+namespace air {
+/// Creates the `air-translate-to-llvm` module pass (see GPUPasses.td).
+std::unique_ptr<mlir::Pass> createAIRTranslateToLLVMPass();
+
+} // namespace air
+} // namespace xilinx
+
+#endif // AIR_CONVERSION_AIR_TRANSLATE_TO_LLVM_PASS_H
diff --git a/mlir/include/air/Conversion/GPUPassDetail.h b/mlir/include/air/Conversion/GPUPassDetail.h
index bcf944587..4cf0e1ab8 100644
--- a/mlir/include/air/Conversion/GPUPassDetail.h
+++ b/mlir/include/air/Conversion/GPUPassDetail.h
@@ -23,8 +23,13 @@ namespace air {
 using namespace mlir;
 
 #define GEN_PASS_DECL
+#define GEN_PASS_DEF_AIRTRANSLATETOLLVM
 #define GEN_PASS_DEF_CONVERTAIRTOROCDL
 #define GEN_PASS_DEF_CONVERTGPUKERNELOUTLINE
+#define GEN_PASS_DEF_AIRRANKTOMGPU
+#define GEN_PASS_DEF_AIRSYMMETRICALLOCTOMGPU
+#define GEN_PASS_DEF_AIRCROSSRANKDMATOMGPU
+#define GEN_PASS_DEF_AIRGPUCHANNELTOMGPU
 #include "air/Conversion/GPUPasses.h.inc"
 
 } // namespace air
diff --git a/mlir/include/air/Conversion/GPUPasses.td b/mlir/include/air/Conversion/GPUPasses.td
index ae846cf12..056104bc2 100644
--- a/mlir/include/air/Conversion/GPUPasses.td
+++ b/mlir/include/air/Conversion/GPUPasses.td
@@ -21,6 +21,23 @@ def ConvertAIRToROCDL : Pass<"air-to-rocdl", "ModuleOp"> {
   let options = [];
 }
 
+def AIRTranslateToLLVM : Pass<"air-translate-to-llvm", "ModuleOp"> {
+  // Module-level expansion of `air.translate` into address arithmetic.
+  let summary = "Lower air.translate to memref.reinterpret_cast + LLVM-dialect address arithmetic";
+  // Factory declared in air/Conversion/AIRTranslateToLLVMPass.h.
+  let constructor = "xilinx::air::createAIRTranslateToLLVMPass()";
+  let dependentDialects = ["mlir::arith::ArithDialect",
+                           "mlir::memref::MemRefDialect",
+                           "mlir::LLVM::LLVMDialect"];
+  let description = [{
+    Expands each `air.translate` op into the pointer-rebase computation:
+    `bases[to_rank] - bases[from_rank]`, converted from bytes to elements
+    of the source memref's element type, then applied as a new offset
+    via `memref.reinterpret_cast`. The expansion is pure arithmetic; it
+    works identically on host functions and inside `gpu.func`.
+  }];
+}
+
 def ConvertGPUKernelOutline : Pass<"air-gpu-outlining", "ModuleOp"> {
   let summary = "Outline GPU Kernel Func from GPU Launch";
   let constructor = "xilinx::air::createGPUKernelOutlinePass()";
@@ -32,4 +49,107 @@ def ConvertGPUKernelOutline : Pass<"air-gpu-outlining", "ModuleOp"> {
   let options = [];
 }
 
+def AIRGpuChannelToMgpu : Pass<"air-gpu-channel-to-mgpu", "ModuleOp"> {
+  let summary = "Lower air.channel.put/get of channel_type=\"gpu_symmetric_heap\" "
+                "to host-side mgpuMemcpy (peer-VA) + mgpuBarrier";
+  let constructor = "xilinx::air::createAIRGpuChannelToMgpuPass()";
+  let description = [{
+    For each `air.channel @C [...] {channel_type = "gpu_symmetric_heap"}`,
+    pair its single `air.channel.put` and single `air.channel.get`. The put
+    becomes `mgpuBarrier()` (publish: data is already in the symmetric heap
+    via the put's `air.symmetric` source memref). The get becomes
+    `mgpuBarrier()` followed by `mgpuMemcpy(dst, peer_va(put_src), size)`
+    where the peer rank is the get's first index operand and the peer VA is
+    computed via `mgpuGetHeapBases()`.
+
+    Restrictions in this initial version:
+      - One put and one get per channel symbol.
+      - Both put and get at host scope (no `gpu.launch`/`gpu.func`).
+      - put's source memref must be `air.symmetric`-tagged (checked when it
+        is produced directly by `memref.alloc`), have a static shape, and a
+        byte-aligned integer/float element type.
+      - put's source and get's destination must be in `memory_space=0`.
+      - "Entire memref" form only on both sides.
+      - get must take exactly one index operand (the peer rank).
+  }];
+  let dependentDialects = ["func::FuncDialect", "arith::ArithDialect",
+                           "memref::MemRefDialect", "LLVM::LLVMDialect"];
+}
+
+def AIRCrossRankDmaToMgpu : Pass<"air-cross-rank-dma-to-mgpu", "ModuleOp"> {
+  let summary = "Lower air.dma_memcpy_nd with src_rank/dst_rank to mgpuMemcpy "
+                "with peer-VA addressing through mgpuGetHeapBases()";
+  let constructor = "xilinx::air::createAIRCrossRankDmaToMgpuPass()";
+  let description = [{
+    For each `air.dma_memcpy_nd` op carrying a `src_rank` or `dst_rank`
+    integer attribute, emit a host-side `mgpuMemcpy` whose peer-side pointer
+    is computed as `mgpuGetHeapBases()[peer] + (local_ptr - local_base)`.
+
+    Restrictions in this initial version:
+      - Exactly one of `src_rank` / `dst_rank` may be set on an op.
+      - Both `src` and `dst` memrefs must be in `memory_space=0`.
+      - The op must be at host scope (not inside any `gpu.launch`/`gpu.func`).
+      - The peer-side memref needs a static shape and byte-aligned elements.
+      - "Entire memref" form only: `[]` `[]` `[]` for both sides — no
+        custom offsets / sizes / strides.
+
+    Run this pass *before* `air-symmetric-alloc-to-mgpu` so that pointer
+    extraction (`memref.extract_aligned_pointer_as_index`) sees plain
+    memrefs rather than already-cast LLVM struct values.
+  }];
+  let dependentDialects = ["func::FuncDialect", "arith::ArithDialect",
+                           "memref::MemRefDialect", "LLVM::LLVMDialect"];
+}
+
+def AIRSymmetricAllocToMgpu : Pass<"air-symmetric-alloc-to-mgpu", "ModuleOp"> {
+  // Factory declared in air/Conversion/AIRSymmetricAllocToMgpuPass.h.
+  let constructor = "xilinx::air::createAIRSymmetricAllocToMgpuPass()";
+  let summary = "Lower memref.alloc {air.symmetric} to mgpuSymmetricAlloc and "
+                "memref.dealloc of the result to mgpuSymmetricFree";
+  let dependentDialects = ["func::FuncDialect", "arith::ArithDialect",
+                           "LLVM::LLVMDialect"];
+  let description = [{
+    Replaces each `memref.alloc` carrying the unit attribute `air.symmetric`
+    with a call to `mgpuSymmetricAlloc(size_in_bytes, stream)` returning
+    `!llvm.ptr`, then builds an LLVM memref descriptor (struct) wrapping that
+    pointer and projects it back to the original memref type via
+    `builtin.unrealized_conversion_cast` so downstream uses keep working.
+
+    For every `memref.dealloc` whose operand traces back (through a single
+    `unrealized_conversion_cast`) to such a symmetric alloc, the pass emits
+    `mgpuSymmetricFree(ptr, stream)` and erases the dealloc.
+
+    Should run before `convert-to-llvm`. Does nothing if no `air.symmetric`
+    allocations are present.
+  }];
+}
+
+def AIRRankToMgpu : Pass<"air-rank-to-mgpu", "ModuleOp"> {
+  let summary = "Lower air.rank to mgpu* runtime calls (multi-GPU process model)";
+  // Factory declared in air/Conversion/AIRRankToMgpuPass.h.
+  let constructor = "xilinx::air::createAIRRankToMgpuPass()";
+  let dependentDialects = ["func::FuncDialect", "arith::ArithDialect"];
+  // Default heap size: 268435456 bytes = 256 * 1024 * 1024.
+  let options = [
+    Option<"heapSize", "heap-size", "uint64_t", "/*default=*/268435456",
+           "Symmetric heap size in bytes (default: 256 MB)">
+  ];
+  let description = [{
+    Each `air.rank` op is replaced by inlining its body in place, with rank
+    IDs computed from `mgpuGetRank()` (delinearized into the rank's N-D
+    iteration space) and rank sizes substituted from the static size operands.
+
+    The pass also inserts `mgpuSymmetricHeapInit(heap_size)` at the entry of
+    the enclosing `func.func` (default 256 MB; configurable via the
+    `heap-size` option) and `mgpuSymmetricHeapDestroy()` before each
+    `func.return` in that function.
+
+    This replaces `air-rank-to-launch` for the GPU pipeline. Unlike
+    `air-rank-to-launch` (which serializes ranks via `scf.for`), this pass
+    assumes each process executes the whole rank body once and runtime
+    coordinates across processes via env vars (RANK / WORLD_SIZE / LOCAL_RANK)
+    and the symmetric-heap fabric.
+  }];
+}
+
 #endif // AIR_CONVERSION_GPU_PASSES
diff --git a/mlir/include/air/Dialect/AIR/AIR.td b/mlir/include/air/Dialect/AIR/AIR.td
index 19575bb3e..74703144b 100644
--- a/mlir/include/air/Dialect/AIR/AIR.td
+++ b/mlir/include/air/Dialect/AIR/AIR.td
@@ -926,6 +926,43 @@ def air_ExecuteTerminatorOp : air_Op<"execute_terminator", [HasParent<"ExecuteOp
       [{  attr-dict ($results^ `:` type($results))? }];
 }
 
+def air_TranslateOp : air_Op<"translate",
+                              [Pure, AllTypesMatch<["source", "result"]>]>,
+                       Arguments<(ins AnyMemRef:$source,
+                                      Index:$from_rank,
+                                      Index:$to_rank,
+                                      MemRefRankOf<[Index], [1]>:$heap_bases)>,
+                       Results<(outs AnyMemRef:$result)> {
+  let summary = "Re-express a symmetric-heap memref in another rank's address space";
+  let hasFolder = 1; // fold rule is stated in the description below
+  let assemblyFormat =
+      [{ $source `,` $from_rank `,` $to_rank `,` $heap_bases
+         attr-dict `:` type($source) `,` type($heap_bases) }];
+  let description = [{
+    Produces a memref of the same type as `$source` whose underlying
+    pointer references the corresponding allocation on `$to_rank`. The
+    `$source` memref is assumed to live on `$from_rank`'s symmetric heap.
+    The translation is the pointer rebase
+
+        peer_va = bases[to_rank] + (source_ptr - bases[from_rank])
+
+    where `$heap_bases` is a 1-D memref of `index`-typed pointer values
+    (per-rank symmetric-heap base addresses) obtained from the
+    `mgpuGetHeapBases()` runtime hook. The host typically wraps the raw
+    runtime pointer as a `memref<?xindex>` once and threads it through
+    `gpu.launch_func` as a kernel argument. No data is moved; this op
+    produces a value-level "view" of peer memory.
+
+    Folds to `$source` when `$from_rank` and `$to_rank` are statically
+    equal.
+
+    Both ranks must address the same collective allocation on the
+    symmetric heap (i.e. `$source` must trace back to a
+    `memref.alloc {air.symmetric}`). Using this op outside that contract
+    is undefined.
+  }];
+}
+
 // AIR custom op, as a handle for a user-provided AIE kernel
 
 def air_CustomOp : air_Op<"custom", [air_AsyncOpInterface,
diff --git a/mlir/lib/Conversion/AIRCrossRankDmaToMgpuPass.cpp b/mlir/lib/Conversion/AIRCrossRankDmaToMgpuPass.cpp
new file mode 100644
index 000000000..34c7cee99
--- /dev/null
+++ b/mlir/lib/Conversion/AIRCrossRankDmaToMgpuPass.cpp
@@ -0,0 +1,247 @@
+//===- AIRCrossRankDmaToMgpuPass.cpp ---------------------------*- C++ -*-===//
+//
+// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+//===-----------------------------------------------------------------------===//
+//
+// Lower air.dma_memcpy_nd ops carrying a `src_rank` or `dst_rank` integer
+// attribute to host-side mgpuMemcpy calls with peer-VA addressing through
+// mgpuGetHeapBases().
+//
+// Pattern emitted (for src_rank = R):
+//   %size       = arith.constant <bytes> : i64
+//   %nullptr    = llvm.mlir.zero : !llvm.ptr
+//   %dst_ptr    = (extract aligned ptr from %dst memref)
+//   %src_ptr    = (extract aligned ptr from %src memref)
+//   %my_rank    = call @mgpuGetRank() : () -> i32
+//   %bases      = call @mgpuGetHeapBases() : () -> !llvm.ptr
+//   %my_base_at = llvm.getelementptr %bases[%my_rank] : ... -> !llvm.ptr, !llvm.ptr
+//   %my_base    = llvm.load %my_base_at : !llvm.ptr -> !llvm.ptr
+//   %src_int    = llvm.ptrtoint %src_ptr  : !llvm.ptr to i64
+//   %my_base_int = llvm.ptrtoint %my_base : !llvm.ptr to i64
+//   %offset     = arith.subi %src_int, %my_base_int : i64
+//   %peer_base_at = llvm.getelementptr %bases[<R>] : ... -> !llvm.ptr, !llvm.ptr
+//   %peer_base    = llvm.load %peer_base_at : !llvm.ptr -> !llvm.ptr
+//   %peer_src     = llvm.getelementptr %peer_base[%offset] : ... -> !llvm.ptr, i8
+//   call @mgpuMemcpy(%dst_ptr, %peer_src, %size, %nullptr)
+//
+// Initial restrictions:
+//   - Both memrefs must have memory_space=0 (L3/global).
+//   - Op must be at host scope (not inside a gpu.launch / gpu.func).
+//   - "Entire memref" form only: empty offsets/sizes/strides on both sides.
+//
+//===-----------------------------------------------------------------------===//
+
+#include "air/Conversion/AIRCrossRankDmaToMgpuPass.h"
+#include "air/Conversion/GPUPassDetail.h"
+#include "air/Dialect/AIR/AIRDialect.h"
+
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/Pass/Pass.h"
+
+using namespace mlir;
+using namespace xilinx;
+
+namespace {
+
+// Returns func.func `name` from `module`, declaring it (private) when absent.
+static func::FuncOp ensureExternFunc(ModuleOp module, OpBuilder &builder,
+                                     StringRef name, FunctionType type) {
+  if (func::FuncOp existing = module.lookupSymbol<func::FuncOp>(name))
+    return existing;
+  OpBuilder::InsertionGuard restoreInsertionPoint(builder);
+  builder.setInsertionPointToStart(module.getBody());
+  auto decl = func::FuncOp::create(builder, module.getLoc(), name, type);
+  decl.setPrivate();
+  return decl;
+}
+
+// Byte size of a static-shape memref as an i64 constant; null Value when the
+// shape is dynamic or the element type is not a byte-multiple int/float.
+static Value computeMemrefByteSize(OpBuilder &b, Location loc, MemRefType ty) {
+  // getIntOrFloatBitWidth() asserts on other types (index, vector, ...).
+  Type eltTy = ty.getElementType();
+  if (!ty.hasStaticShape() || !eltTy.isIntOrFloat())
+    return nullptr;
+  unsigned eltBits = eltTy.getIntOrFloatBitWidth();
+  if (eltBits == 0 || (eltBits % 8) != 0)
+    return nullptr;
+  int64_t totalBytes = ty.getNumElements() * (eltBits / 8);
+  return arith::ConstantOp::create(b, loc, b.getI64Type(),
+                                   b.getI64IntegerAttr(totalBytes));
+}
+
+// Aligned base pointer of `memref` as !llvm.ptr (index -> i64 -> inttoptr).
+static Value extractAlignedPtr(OpBuilder &b, Location loc, Value memref) {
+  auto llvmPtrTy = LLVM::LLVMPointerType::get(b.getContext());
+  Value addr = memref::ExtractAlignedPointerAsIndexOp::create(b, loc, memref);
+  Value addrI64 = arith::IndexCastOp::create(b, loc, b.getI64Type(), addr);
+  return LLVM::IntToPtrOp::create(b, loc, llvmPtrTy, addrI64);
+}
+
+struct AIRCrossRankDmaToMgpuPass
+    : public xilinx::air::impl::AIRCrossRankDmaToMgpuBase<
+          AIRCrossRankDmaToMgpuPass> {
+  // Module pass; each error path reports on the offending DMA and fails.
+  AIRCrossRankDmaToMgpuPass() = default;
+  AIRCrossRankDmaToMgpuPass(const AIRCrossRankDmaToMgpuPass &) {}
+
+  void runOnOperation() override {
+    auto module = getOperation();
+    OpBuilder builder(module.getContext());
+    auto i32Ty = builder.getI32Type();
+    auto i64Ty = builder.getI64Type();
+    auto ptrTy = LLVM::LLVMPointerType::get(module.getContext());
+
+    // Collect first, rewrite later: ops are erased during the rewrite loop.
+    SmallVector<air::DmaMemcpyNdOp> crossRankDmas;
+    module.walk([&](air::DmaMemcpyNdOp op) {
+      if (op.hasCrossRank())
+        crossRankDmas.push_back(op);
+    });
+    if (crossRankDmas.empty())
+      return;
+
+    // Declare the runtime ABI functions we may need.
+    auto getRankFn = ensureExternFunc(module, builder, "mgpuGetRank",
+                                       builder.getFunctionType({}, {i32Ty}));
+    auto getBasesFn =
+        ensureExternFunc(module, builder, "mgpuGetHeapBases",
+                          builder.getFunctionType({}, {ptrTy}));
+    auto memcpyFn = ensureExternFunc(
+        module, builder, "mgpuMemcpy",
+        builder.getFunctionType({ptrTy, ptrTy, i64Ty, ptrTy}, {}));
+
+    for (air::DmaMemcpyNdOp dma : crossRankDmas) {
+      Location loc = dma.getLoc();
+
+      // Restrictions: host scope and whole-memref form only.
+      if (dma->getParentOfType<gpu::LaunchOp>() ||
+          dma->getParentOfType<gpu::GPUFuncOp>()) {
+        dma.emitOpError(
+            "cross-rank DMA inside a GPU kernel is not yet supported");
+        signalPassFailure();
+        return;
+      }
+      if (!dma.getSrcOffsets().empty() || !dma.getSrcSizes().empty() ||
+          !dma.getSrcStrides().empty() || !dma.getDstOffsets().empty() ||
+          !dma.getDstSizes().empty() || !dma.getDstStrides().empty()) {
+        dma.emitOpError("cross-rank DMA with explicit offsets/sizes/strides "
+                        "is not yet supported");
+        signalPassFailure();
+        return;
+      }
+
+      auto srcType = cast<MemRefType>(dma.getSrcMemref().getType());
+      auto dstType = cast<MemRefType>(dma.getDstMemref().getType());
+      if (srcType.getMemorySpaceAsInt() != 0 ||
+          dstType.getMemorySpaceAsInt() != 0) {
+        dma.emitOpError(
+            "cross-rank DMA requires both memrefs in memory_space=0");
+        signalPassFailure();
+        return;
+      }
+
+      // Determine which side has the rank attribute. (Only one is supported
+      // per op for now.)
+      bool srcIsPeer = dma.getSrcRank().has_value();
+      bool dstIsPeer = dma.getDstRank().has_value();
+      if (srcIsPeer && dstIsPeer) {
+        dma.emitOpError(
+            "cross-rank DMA with both src_rank and dst_rank set is not yet "
+            "supported");
+        signalPassFailure();
+        return;
+      }
+      int64_t peerRank =
+          srcIsPeer ? *dma.getSrcRank() : *dma.getDstRank();
+      auto peerSideType = srcIsPeer ? srcType : dstType;
+      Value peerMemref = srcIsPeer ? dma.getSrcMemref() : dma.getDstMemref();
+      Value localMemref =
+          srcIsPeer ? dma.getDstMemref() : dma.getSrcMemref();
+
+      builder.setInsertionPoint(dma);
+      Value sizeBytes = computeMemrefByteSize(builder, loc, peerSideType);
+      if (!sizeBytes) {
+        dma.emitOpError("cross-rank DMA requires static memref shape with "
+                        "byte-aligned element type");
+        signalPassFailure();
+        return;
+      }
+      Value nullPtr = LLVM::ZeroOp::create(builder, loc, ptrTy);
+
+      Value peerLocalPtr = extractAlignedPtr(builder, loc, peerMemref);
+      Value localPtr = extractAlignedPtr(builder, loc, localMemref);
+
+      // bases = mgpuGetHeapBases()
+      Value bases = func::CallOp::create(builder, loc, getBasesFn, ValueRange{})
+                       .getResult(0);
+
+      // my_rank = mgpuGetRank() (i32 -> i64)
+      Value myRankI32 =
+          func::CallOp::create(builder, loc, getRankFn, ValueRange{})
+              .getResult(0);
+      Value myRankI64 = arith::ExtSIOp::create(builder, loc, i64Ty, myRankI32);
+
+      // my_base = bases[my_rank]
+      Value myBaseAddr = LLVM::GEPOp::create(builder, loc, ptrTy, ptrTy, bases,
+                                              ArrayRef<Value>{myRankI64});
+      Value myBase = LLVM::LoadOp::create(builder, loc, ptrTy, myBaseAddr);
+
+      // peer_base = bases[<peerRank>]
+      Value peerRankIdx = LLVM::ConstantOp::create(
+          builder, loc, i64Ty, builder.getI64IntegerAttr(peerRank));
+      Value peerBaseAddr = LLVM::GEPOp::create(
+          builder, loc, ptrTy, ptrTy, bases, ArrayRef<Value>{peerRankIdx});
+      Value peerBase = LLVM::LoadOp::create(builder, loc, ptrTy, peerBaseAddr);
+
+      // offset = peerLocalPtr (as i64) - my_base (as i64)
+      Value peerLocalInt =
+          LLVM::PtrToIntOp::create(builder, loc, i64Ty, peerLocalPtr);
+      Value myBaseInt = LLVM::PtrToIntOp::create(builder, loc, i64Ty, myBase);
+      Value offset =
+          arith::SubIOp::create(builder, loc, peerLocalInt, myBaseInt);
+
+      // peer_ptr = peer_base + offset (byte-stride GEP)
+      auto i8Ty = builder.getI8Type();
+      Value peerPtr = LLVM::GEPOp::create(builder, loc, ptrTy, i8Ty, peerBase,
+                                           ArrayRef<Value>{offset});
+
+      // mgpuMemcpy(dst, src, size, nullptr) — substitute peerPtr on the
+      // peer side.
+      Value srcArg = srcIsPeer ? peerPtr : localPtr;
+      Value dstArg = dstIsPeer ? peerPtr : localPtr;
+      func::CallOp::create(builder, loc, memcpyFn,
+                            ValueRange{dstArg, srcArg, sizeBytes, nullPtr});
+
+      // Async users get the token of a fresh, operand-less wait_all instead.
+      if (dma.getAsyncToken()) {
+        Value tok = air::WaitAllOp::create(
+                         builder, loc,
+                         air::AsyncTokenType::get(builder.getContext()),
+                         ValueRange{})
+                        .getAsyncToken();
+        dma.getAsyncToken().replaceAllUsesWith(tok);
+      }
+      dma.erase();
+    }
+  }
+};
+
+} // namespace
+
+namespace xilinx {
+namespace air {
+/// Factory named by the `constructor` field of AIRCrossRankDmaToMgpu.
+std::unique_ptr<mlir::Pass> createAIRCrossRankDmaToMgpuPass() {
+  return std::make_unique<AIRCrossRankDmaToMgpuPass>();
+}
+
+} // namespace air
+} // namespace xilinx
diff --git a/mlir/lib/Conversion/AIRGpuChannelToMgpuPass.cpp b/mlir/lib/Conversion/AIRGpuChannelToMgpuPass.cpp
new file mode 100644
index 000000000..272ff456e
--- /dev/null
+++ b/mlir/lib/Conversion/AIRGpuChannelToMgpuPass.cpp
@@ -0,0 +1,285 @@
+//===- AIRGpuChannelToMgpuPass.cpp ------------------------------*- C++ -*-===//
+//
+// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+//===-----------------------------------------------------------------------===//
+//
+// Lower air.channel of channel_type="gpu_symmetric_heap" plus its put/get
+// pair to host-side mgpuMemcpy with peer-VA addressing through
+// mgpuGetHeapBases(), with mgpuBarrier-based synchronization.
+//
+// Per channel:
+//   - put becomes mgpuBarrier() (publish — the data is already in the
+//     symmetric heap via the put's air.symmetric source memref)
+//   - get becomes mgpuBarrier() followed by mgpuMemcpy(dst, peer_va(src), sz)
+//     where the peer rank is the get's first index operand
+//
+//===-----------------------------------------------------------------------===//
+
+#include "air/Conversion/AIRGpuChannelToMgpuPass.h"
+#include "air/Conversion/GPUPassDetail.h"
+#include "air/Dialect/AIR/AIRDialect.h"
+
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/SymbolTable.h"
+#include "mlir/Pass/Pass.h"
+
+using namespace mlir;
+using namespace xilinx;
+
+namespace {
+
+static func::FuncOp ensureExternFunc(ModuleOp module, OpBuilder &builder,
+                                     StringRef name, FunctionType type) {
+  if (func::FuncOp existing = module.lookupSymbol<func::FuncOp>(name))
+    return existing;
+  OpBuilder::InsertionGuard restoreInsertionPoint(builder);
+  builder.setInsertionPointToStart(module.getBody());
+  auto decl = func::FuncOp::create(builder, module.getLoc(), name, type);
+  decl.setPrivate(); // declared at the top of the module body
+  return decl;
+}
+
+static Value computeMemrefByteSize(OpBuilder &b, Location loc, MemRefType ty) {
+  // Null on dynamic shapes and on element types that are not byte-multiple
+  // ints/floats (getIntOrFloatBitWidth() asserts on anything else).
+  Type eltTy = ty.getElementType();
+  if (!ty.hasStaticShape() || !eltTy.isIntOrFloat())
+    return nullptr;
+  unsigned eltBits = eltTy.getIntOrFloatBitWidth();
+  if (eltBits == 0 || (eltBits % 8) != 0)
+    return nullptr;
+  int64_t totalBytes = ty.getNumElements() * (eltBits / 8);
+  return arith::ConstantOp::create(b, loc, b.getI64Type(),
+                                   b.getI64IntegerAttr(totalBytes));
+}
+
+static Value extractAlignedPtr(OpBuilder &b, Location loc, Value memref) {
+  auto llvmPtrTy = LLVM::LLVMPointerType::get(b.getContext());
+  Value addr = memref::ExtractAlignedPointerAsIndexOp::create(b, loc, memref);
+  Value addrI64 = arith::IndexCastOp::create(b, loc, b.getI64Type(), addr);
+  return LLVM::IntToPtrOp::create(b, loc, llvmPtrTy, addrI64);
+}
+
+struct AIRGpuChannelToMgpuPass
+    : public xilinx::air::impl::AIRGpuChannelToMgpuBase<
+          AIRGpuChannelToMgpuPass> {
+  // Lowers gpu_symmetric_heap channels to host-side mgpuBarrier/mgpuMemcpy.
+  AIRGpuChannelToMgpuPass() = default;
+  AIRGpuChannelToMgpuPass(const AIRGpuChannelToMgpuPass &) {}
+  // Any unsupported form is reported on the channel op and fails the pass.
+  void runOnOperation() override {
+    auto module = getOperation();
+    OpBuilder builder(module.getContext());
+    auto i32Ty = builder.getI32Type();
+    auto i64Ty = builder.getI64Type();
+    auto ptrTy = LLVM::LLVMPointerType::get(module.getContext());
+
+    // Collect gpu_symmetric_heap channel decls and their put/get sites.
+    SmallVector<air::ChannelOp> chans;
+    module.walk([&](air::ChannelOp ch) {
+      if (ch.getChannelType() == "gpu_symmetric_heap")
+        chans.push_back(ch);
+    });
+    if (chans.empty())
+      return;
+
+    auto getRankFn = ensureExternFunc(module, builder, "mgpuGetRank",
+                                       builder.getFunctionType({}, {i32Ty}));
+    auto getBasesFn =
+        ensureExternFunc(module, builder, "mgpuGetHeapBases",
+                          builder.getFunctionType({}, {ptrTy}));
+    auto memcpyFn = ensureExternFunc(
+        module, builder, "mgpuMemcpy",
+        builder.getFunctionType({ptrTy, ptrTy, i64Ty, ptrTy}, {}));
+    auto barrierFn = ensureExternFunc(
+        module, builder, "mgpuBarrier", builder.getFunctionType({}, {}));
+
+    for (air::ChannelOp ch : chans) {
+      StringAttr sym = ch.getSymNameAttr();
+
+      // Find puts and gets that reference this channel symbol.
+      SmallVector<air::ChannelPutOp> puts;
+      SmallVector<air::ChannelGetOp> gets;
+      module.walk([&](air::ChannelPutOp p) {
+        if (p.getChanName() == sym.getValue())
+          puts.push_back(p);
+      });
+      module.walk([&](air::ChannelGetOp g) {
+        if (g.getChanName() == sym.getValue())
+          gets.push_back(g);
+      });
+
+      if (puts.size() != 1 || gets.size() != 1) {
+        ch.emitOpError()
+            << "channel_type=\"gpu_symmetric_heap\" requires exactly one "
+               "put and one get per channel; found "
+            << puts.size() << " put(s), " << gets.size() << " get(s)";
+        signalPassFailure();
+        return;
+      }
+      air::ChannelPutOp put = puts.front();
+      air::ChannelGetOp get = gets.front();
+
+      // Restrictions
+      if (put->getParentOfType<gpu::LaunchOp>() ||
+          put->getParentOfType<gpu::GPUFuncOp>() ||
+          get->getParentOfType<gpu::LaunchOp>() ||
+          get->getParentOfType<gpu::GPUFuncOp>()) {
+        ch.emitOpError("gpu_symmetric_heap put/get inside a GPU kernel is "
+                       "not yet supported");
+        signalPassFailure();
+        return;
+      }
+      if (!put.getSrcOffsets().empty() || !put.getSrcSizes().empty() ||
+          !put.getSrcStrides().empty() || !get.getDstOffsets().empty() ||
+          !get.getDstSizes().empty() || !get.getDstStrides().empty()) {
+        ch.emitOpError("gpu_symmetric_heap put/get with explicit "
+                       "offsets/sizes/strides is not yet supported");
+        signalPassFailure();
+        return;
+      }
+
+      auto srcType = cast<MemRefType>(put.getSrc().getType());
+      auto dstType = cast<MemRefType>(get.getDst().getType());
+      if (srcType.getMemorySpaceAsInt() != 0 ||
+          dstType.getMemorySpaceAsInt() != 0) {
+        ch.emitOpError(
+            "gpu_symmetric_heap put/get requires both memrefs in memory_space=0");
+        signalPassFailure();
+        return;
+      }
+
+      // A memref.alloc put source must be air.symmetric so peers can read it.
+      if (auto allocOp = put.getSrc().getDefiningOp<memref::AllocOp>())
+        if (!allocOp->hasAttr("air.symmetric")) {
+          ch.emitOpError("gpu_symmetric_heap put requires a memref.alloc "
+                         "carrying the \"air.symmetric\" attribute");
+          signalPassFailure();
+          return;
+        }
+
+      if (get.getIndices().size() != 1) {
+        ch.emitOpError("gpu_symmetric_heap get requires exactly one index "
+                       "operand (the peer rank)");
+        signalPassFailure();
+        return;
+      }
+      Value peerRankIdx = get.getIndices().front();
+
+      // ---- Lower put: emit barrier (publish); erased after the get ----
+      Location putLoc = put.getLoc();
+      builder.setInsertionPointAfter(put);
+      func::CallOp::create(builder, putLoc, barrierFn, ValueRange{});
+      if (put.getAsyncToken()) {
+        Value tok = air::WaitAllOp::create(
+                         builder, putLoc,
+                         air::AsyncTokenType::get(builder.getContext()),
+                         ValueRange{})
+                        .getAsyncToken();
+        put.getAsyncToken().replaceAllUsesWith(tok);
+      }
+
+      // ---- Lower get: barrier + cross-rank mgpuMemcpy(dst, peer_va(src), sz) ----
+      Location getLoc = get.getLoc();
+      builder.setInsertionPoint(get);
+
+      // Barrier (consume)
+      func::CallOp::create(builder, getLoc, barrierFn, ValueRange{});
+
+      Value sizeBytes = computeMemrefByteSize(builder, getLoc, srcType);
+      if (!sizeBytes) {
+        ch.emitOpError("gpu_symmetric_heap requires static memref shape");
+        signalPassFailure();
+        return;
+      }
+      Value nullPtr = LLVM::ZeroOp::create(builder, getLoc, ptrTy);
+
+      Value srcLocalPtr = extractAlignedPtr(builder, getLoc, put.getSrc());
+      Value dstLocalPtr = extractAlignedPtr(builder, getLoc, get.getDst());
+
+      Value bases =
+          func::CallOp::create(builder, getLoc, getBasesFn, ValueRange{})
+              .getResult(0);
+      Value myRankI32 =
+          func::CallOp::create(builder, getLoc, getRankFn, ValueRange{})
+              .getResult(0);
+      Value myRankI64 =
+          arith::ExtSIOp::create(builder, getLoc, i64Ty, myRankI32);
+      Value myBaseAddr = LLVM::GEPOp::create(builder, getLoc, ptrTy, ptrTy,
+                                              bases, ArrayRef<Value>{myRankI64});
+      Value myBase = LLVM::LoadOp::create(builder, getLoc, ptrTy, myBaseAddr);
+
+      // Peer rank: convert dynamic index operand to i64.
+      Value peerRankI64;
+      Type peerTy = peerRankIdx.getType();
+      if (isa<IndexType>(peerTy))
+        peerRankI64 = arith::IndexCastOp::create(builder, getLoc, i64Ty,
+                                                  peerRankIdx);
+      else if (auto intTy = dyn_cast<IntegerType>(peerTy)) {
+        if (intTy.getWidth() == 64)
+          peerRankI64 = peerRankIdx;
+        else
+          peerRankI64 =
+              arith::ExtSIOp::create(builder, getLoc, i64Ty, peerRankIdx);
+      } else {
+        ch.emitOpError("gpu_symmetric_heap get peer-rank index must be index "
+                       "or integer type");
+        signalPassFailure();
+        return;
+      }
+
+      Value peerBaseAddr = LLVM::GEPOp::create(
+          builder, getLoc, ptrTy, ptrTy, bases, ArrayRef<Value>{peerRankI64});
+      Value peerBase =
+          LLVM::LoadOp::create(builder, getLoc, ptrTy, peerBaseAddr);
+
+      Value srcLocalInt =
+          LLVM::PtrToIntOp::create(builder, getLoc, i64Ty, srcLocalPtr);
+      Value myBaseInt =
+          LLVM::PtrToIntOp::create(builder, getLoc, i64Ty, myBase);
+      Value offset =
+          arith::SubIOp::create(builder, getLoc, srcLocalInt, myBaseInt);
+
+      auto i8Ty = builder.getI8Type();
+      Value peerSrc = LLVM::GEPOp::create(builder, getLoc, ptrTy, i8Ty,
+                                           peerBase, ArrayRef<Value>{offset});
+
+      func::CallOp::create(
+          builder, getLoc, memcpyFn,
+          ValueRange{dstLocalPtr, peerSrc, sizeBytes, nullPtr});
+
+      if (get.getAsyncToken()) {
+        Value tok = air::WaitAllOp::create(
+                         builder, getLoc,
+                         air::AsyncTokenType::get(builder.getContext()),
+                         ValueRange{})
+                        .getAsyncToken();
+        get.getAsyncToken().replaceAllUsesWith(tok);
+      }
+      get.erase();
+      // Erase the put only now: the get lowering above reads put.getSrc().
+      put.erase();
+      // The channel symbol can now be erased.
+      ch.erase();
+    }
+  }
+};
+
+} // namespace
+
+namespace xilinx {
+namespace air {
+/// Factory named by the `constructor` field of AIRGpuChannelToMgpu.
+std::unique_ptr<mlir::Pass> createAIRGpuChannelToMgpuPass() {
+  return std::make_unique<AIRGpuChannelToMgpuPass>();
+}
+
+} // namespace air
+} // namespace xilinx
diff --git a/mlir/lib/Conversion/AIRRankToMgpuPass.cpp b/mlir/lib/Conversion/AIRRankToMgpuPass.cpp
new file mode 100644
index 000000000..654120cbc
--- /dev/null
+++ b/mlir/lib/Conversion/AIRRankToMgpuPass.cpp
@@ -0,0 +1,181 @@
+//===- AIRRankToMgpuPass.cpp -----------------------------------*- C++ -*-===//
+//
+// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+//===-----------------------------------------------------------------------===//
+//
+// Lower air.rank to mgpu* runtime calls (multi-GPU process model).
+//
+// Each `air.rank` op is replaced by inlining its body in place, with rank
+// IDs computed from `mgpuGetRank()` (delinearized into the rank's N-D
+// iteration space) and rank sizes substituted from the static size operands.
+//
+// The pass also inserts `mgpuSymmetricHeapInit(heap_size)` at the entry of
+// the enclosing `func.func` and `mgpuSymmetricHeapDestroy()` before each
+// `func.return` in that function.
+//
+//===-----------------------------------------------------------------------===//
+
+#include "air/Conversion/AIRRankToMgpuPass.h"
+#include "air/Conversion/GPUPassDetail.h"
+#include "air/Dialect/AIR/AIRDialect.h"
+
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/IRMapping.h"
+#include "mlir/Pass/Pass.h"
+
+using namespace mlir;
+using namespace xilinx;
+
+namespace {
+// Fetch func `name` from the module, declaring it (private) if it is missing.
+static func::FuncOp ensureExternFunc(ModuleOp module, OpBuilder &builder,
+                                     StringRef name, FunctionType type) {
+  auto decl = module.lookupSymbol<func::FuncOp>(name);
+  if (!decl) {
+    OpBuilder::InsertionGuard restorePoint(builder);
+    builder.setInsertionPointToStart(module.getBody());
+    decl = func::FuncOp::create(builder, module.getLoc(), name, type);
+    decl.setPrivate();
+  }
+  return decl;
+}
+
+/// Module pass that inlines every `air.rank` region into its parent block.
+/// Rank ids are derived from the result of `mgpuGetRank()`, and each
+/// `func.func` that held an `air.rank` is bracketed with calls to
+/// `mgpuSymmetricHeapInit` / `mgpuSymmetricHeapDestroy`.
+struct AIRRankToMgpuPass
+    : public xilinx::air::impl::AIRRankToMgpuBase<AIRRankToMgpuPass> {
+
+  AIRRankToMgpuPass() = default;
+  AIRRankToMgpuPass(const AIRRankToMgpuPass &pass) {}
+
+  void runOnOperation() override {
+    auto moduleOp = getOperation();
+
+    // Gather air.rank ops and their enclosing functions before mutating IR.
+    SmallVector<air::RankOp> pendingRanks;
+    SetVector<func::FuncOp> hostFuncs;
+    moduleOp.walk([&](air::RankOp rank) {
+      pendingRanks.push_back(rank);
+      if (auto enclosing = rank->getParentOfType<func::FuncOp>())
+        hostFuncs.insert(enclosing);
+    });
+
+    // Without any air.rank the module is returned exactly as it came in.
+    if (pendingRanks.empty())
+      return;
+
+    OpBuilder builder(moduleOp.getContext());
+    auto i32Ty = builder.getI32Type();
+    auto i64Ty = builder.getI64Type();
+    auto idxTy = builder.getIndexType();
+
+    // Runtime entry points, declared on demand at module scope.
+    auto heapInitFn =
+        ensureExternFunc(moduleOp, builder, "mgpuSymmetricHeapInit",
+                         builder.getFunctionType({i64Ty}, {}));
+    auto heapDestroyFn =
+        ensureExternFunc(moduleOp, builder, "mgpuSymmetricHeapDestroy",
+                         builder.getFunctionType({}, {}));
+    auto queryRankFn = ensureExternFunc(moduleOp, builder, "mgpuGetRank",
+                                        builder.getFunctionType({}, {i32Ty}));
+
+    // Bracket each host function: heap init first in the entry block, heap
+    // destroy right before every func.return.
+    for (func::FuncOp hostFn : hostFuncs) {
+      if (hostFn.empty())
+        continue;
+      Location fnLoc = hostFn.getLoc();
+      builder.setInsertionPointToStart(&hostFn.front());
+      auto heapBytesAttr =
+          builder.getI64IntegerAttr(static_cast<int64_t>(heapSize));
+      Value heapBytes =
+          arith::ConstantOp::create(builder, fnLoc, i64Ty, heapBytesAttr);
+      func::CallOp::create(builder, fnLoc, heapInitFn, ValueRange{heapBytes});
+
+      // Snapshot the returns first, then place a destroy call before each.
+      SmallVector<func::ReturnOp> exits;
+      hostFn.walk([&](func::ReturnOp ret) { exits.push_back(ret); });
+      for (func::ReturnOp ret : exits) {
+        builder.setInsertionPoint(ret);
+        func::CallOp::create(builder, ret.getLoc(), heapDestroyFn,
+                             ValueRange{});
+      }
+    }
+
+    // Inline each air.rank body at the position of the op itself.
+    for (air::RankOp rank : pendingRanks) {
+      Location loc = rank.getLoc();
+      builder.setInsertionPoint(rank);
+
+      // Async dependencies become a result-less wait_all ahead of the body.
+      if (!rank.getAsyncDependencies().empty())
+        air::WaitAllOp::create(builder, loc, Type{},
+                               rank.getAsyncDependencies());
+
+      // Flat rank id: runtime i32 -> sign-extended i64 -> index.
+      auto rankQuery =
+          func::CallOp::create(builder, loc, queryRankFn, ValueRange{});
+      Value rawRank = rankQuery.getResult(0);
+      Value wideRank = arith::ExtSIOp::create(builder, loc, i64Ty, rawRank);
+      Value flatId = arith::IndexCastOp::create(builder, loc, idxTy, wideRank);
+
+      // Peel the flat id into per-dimension ids using the size operands
+      // [s0, ..., s(n-1)], first dimension varying fastest:
+      //   id[d]   = (flat / (s0 * ... * s(d-1))) % sd    for d < n-1
+      //   id[n-1] =  flat / (s0 * ... * s(n-2))
+      auto dimSizes = rank.getSizeOperands();
+      unsigned numDims = rank.getNumDims();
+      SmallVector<Value> dimIds(numDims);
+      Value carry = flatId;
+      for (unsigned d = 0; d + 1 < numDims; ++d) {
+        dimIds[d] = arith::RemSIOp::create(builder, loc, carry, dimSizes[d]);
+        carry = arith::DivSIOp::create(builder, loc, carry, dimSizes[d]);
+      }
+      if (numDims != 0)
+        dimIds[numDims - 1] = carry;
+
+      // Block-argument substitution: ids -> computed ids, sizes -> size
+      // operands, kernel arguments -> the operands passed to the op.
+      IRMapping mapping;
+      for (unsigned d = 0; d < numDims; ++d) {
+        mapping.map(rank.getIds()[d], dimIds[d]);
+        mapping.map(rank.getSize()[d], dimSizes[d]);
+      }
+      for (unsigned a = 0, e = rank.getNumKernelOperands(); a < e; ++a)
+        mapping.map(rank.getKernelArgument(a), rank.getKernelOperand(a));
+
+      // Clone everything but the terminator in front of the air.rank op.
+      Block &rankBody = rank.getBody().front();
+      for (Operation &nested : rankBody.without_terminator())
+        builder.clone(nested, mapping);
+
+      // Redirect async-token users to a fresh, dependency-free wait_all.
+      if (rank.getAsyncToken()) {
+        auto tokenTy = air::AsyncTokenType::get(builder.getContext());
+        auto barrier =
+            air::WaitAllOp::create(builder, loc, tokenTy, ValueRange{});
+        rank.getAsyncToken().replaceAllUsesWith(barrier.getAsyncToken());
+      }
+
+      rank.erase();
+    }
+  }
+};
+
+} // namespace
+
+namespace xilinx::air {
+
+/// Factory entry point: builds a new AIRRankToMgpuPass and returns it to the
+/// caller as an owning pointer to the generic mlir::Pass interface.
+std::unique_ptr<mlir::Pass> createAIRRankToMgpuPass() {
+  return std::make_unique<AIRRankToMgpuPass>();
+}
+
+} // namespace xilinx::air
diff --git a/mlir/lib/Conversion/AIRSymmetricAllocToMgpuPass.cpp b/mlir/lib/Conversion/AIRSymmetricAllocToMgpuPass.cpp
new file mode 100644
index 000000000..864f89c1c
--- /dev/null
+++ b/mlir/lib/Conversion/AIRSymmetricAllocToMgpuPass.cpp
@@ -0,0 +1,199 @@
+//===- AIRSymmetricAllocToMgpuPass.cpp -------------------------*- C++ -*-===//
+//
+// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+//===-----------------------------------------------------------------------===//
+//
+// Lower memref.alloc carrying the `air.symmetric` attribute to a call to the
+// runtime function `mgpuSymmetricAlloc`. The returned `!llvm.ptr` is wrapped
+// in an LLVM memref descriptor (struct) and projected back to the original
+// memref type via `builtin.unrealized_conversion_cast` so that downstream
+// uses keep working.
+//
+// `memref.dealloc` ops whose operand traces (through a single
+// `unrealized_conversion_cast`) back to a symmetric alloc are rewritten to
+// `mgpuSymmetricFree`.
+//
+//===-----------------------------------------------------------------------===//
+
+#include "air/Conversion/AIRSymmetricAllocToMgpuPass.h"
+#include "air/Conversion/GPUPassDetail.h"
+
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/Pass/Pass.h"
+
+using namespace mlir;
+using namespace xilinx;
+
+namespace {
+// Look up func `name` at module scope; declare it (private) when it is absent.
+static func::FuncOp ensureExternFunc(ModuleOp module, OpBuilder &builder,
+                                     StringRef name, FunctionType type) {
+  auto decl = module.lookupSymbol<func::FuncOp>(name);
+  if (!decl) {
+    OpBuilder::InsertionGuard restorePoint(builder);
+    builder.setInsertionPointToStart(module.getBody());
+    decl = func::FuncOp::create(builder, module.getLoc(), name, type);
+    decl.setPrivate();
+  }
+  return decl;
+}
+
+// Emit the byte size of `ty` as an i64 arith.constant. Returns nullptr for
+// dynamic shapes and for element types that are not whole-byte int/float.
+static Value computeMemrefByteSize(OpBuilder &b, Location loc, MemRefType ty) {
+  // getIntOrFloatBitWidth() asserts on other element types (e.g. index), so
+  // reject them here and let the caller report a diagnostic instead.
+  Type eltTy = ty.getElementType();
+  if (!ty.hasStaticShape() || !eltTy.isIntOrFloat())
+    return nullptr;
+  unsigned eltBits = eltTy.getIntOrFloatBitWidth();
+  if (eltBits == 0 || (eltBits % 8) != 0)
+    return nullptr;
+  int64_t totalBytes = ty.getNumElements() * (eltBits / 8);
+  return arith::ConstantOp::create(b, loc, b.getI64Type(),
+                                   b.getI64IntegerAttr(totalBytes));
+}
+
+// Assemble the LLVM struct used as the memref descriptor of `memrefTy`:
+// `ptr` fills both pointer fields and the offset field is 0. Sizes are the
+// static extents of the type and strides are the row-major strides derived
+// from them (innermost stride = 1); no other layout is modeled here.
+static Value buildMemrefDescriptor(OpBuilder &b, Location loc,
+                                   MemRefType memrefTy, Value ptr) {
+  auto *ctx = b.getContext();
+  auto i64Ty = b.getI64Type();
+  auto ptrTy = LLVM::LLVMPointerType::get(ctx);
+  ArrayRef<int64_t> extents = memrefTy.getShape();
+  const unsigned rank = extents.size();
+
+  // Descriptor type: !llvm.struct<(ptr, ptr, i64, array<R x i64>,
+  // array<R x i64>)>; rank-0 memrefs carry no size/stride arrays.
+  SmallVector<Type, 5> fieldTys = {ptrTy, ptrTy, i64Ty};
+  if (rank != 0) {
+    auto perDimTy = LLVM::LLVMArrayType::get(i64Ty, rank);
+    fieldTys.push_back(perDimTy);
+    fieldTys.push_back(perDimTy);
+  }
+  auto descTy = LLVM::LLVMStructType::getLiteral(ctx, fieldTys);
+
+  // Helpers: store one field into the running descriptor; emit an i64 const.
+  Value desc = LLVM::PoisonOp::create(b, loc, descTy);
+  auto setField = [&](Value field, ArrayRef<int64_t> position) {
+    desc = LLVM::InsertValueOp::create(b, loc, desc, field, position);
+  };
+  auto constI64 = [&](int64_t value) -> Value {
+    return LLVM::ConstantOp::create(b, loc, i64Ty, b.getI64IntegerAttr(value));
+  };
+  setField(ptr, ArrayRef<int64_t>{0});
+  setField(ptr, ArrayRef<int64_t>{1});
+  setField(constI64(0), ArrayRef<int64_t>{2});
+  if (rank == 0)
+    return desc;
+
+  // Row-major strides, filled from the innermost dimension outwards.
+  SmallVector<int64_t> strides(rank, 1);
+  for (unsigned i = rank - 1; i > 0; --i)
+    strides[i - 1] = strides[i] * extents[i];
+  for (unsigned i = 0; i < rank; ++i) {
+    const auto dim = static_cast<int64_t>(i);
+    setField(constI64(extents[i]), ArrayRef<int64_t>{3, dim});
+    setField(constI64(strides[i]), ArrayRef<int64_t>{4, dim});
+  }
+  return desc;
+}
+
+/// Module pass: `air.symmetric` `memref.alloc` -> `mgpuSymmetricAlloc` call,
+/// and `memref.dealloc` of such a buffer -> `mgpuSymmetricFree` call.
+struct AIRSymmetricAllocToMgpuPass
+    : public xilinx::air::impl::AIRSymmetricAllocToMgpuBase<
+          AIRSymmetricAllocToMgpuPass> {
+
+  AIRSymmetricAllocToMgpuPass() = default;
+  AIRSymmetricAllocToMgpuPass(const AIRSymmetricAllocToMgpuPass &) {}
+
+  void runOnOperation() override {
+    auto moduleOp = getOperation();
+
+    SmallVector<memref::AllocOp> taggedAllocs;
+    moduleOp.walk([&](memref::AllocOp candidate) {
+      if (candidate->hasAttr("air.symmetric"))
+        taggedAllocs.push_back(candidate);
+    });
+    if (taggedAllocs.empty())
+      return;
+
+    OpBuilder builder(moduleOp.getContext());
+    auto i64Ty = builder.getI64Type();
+    auto ptrTy = LLVM::LLVMPointerType::get(moduleOp.getContext());
+
+    auto allocFn =
+        ensureExternFunc(moduleOp, builder, "mgpuSymmetricAlloc",
+                         builder.getFunctionType({i64Ty, ptrTy}, {ptrTy}));
+    auto freeFn =
+        ensureExternFunc(moduleOp, builder, "mgpuSymmetricFree",
+                         builder.getFunctionType({ptrTy, ptrTy}, {}));
+
+    // Replacement memref -> runtime pointer; used to spot symmetric deallocs.
+    DenseMap<Value, Value> rawPtrOf;
+
+    for (memref::AllocOp allocOp : taggedAllocs) {
+      Location loc = allocOp.getLoc();
+      auto bufTy = allocOp.getType();
+      builder.setInsertionPoint(allocOp);
+
+      Value numBytes = computeMemrefByteSize(builder, loc, bufTy);
+      if (!numBytes) {
+        allocOp.emitOpError(
+            "air.symmetric memref.alloc requires a static-shape memref with "
+            "byte-aligned element type");
+        signalPassFailure();
+        return;
+      }
+
+      Value nullArg = LLVM::ZeroOp::create(builder, loc, ptrTy);
+      auto allocCall = func::CallOp::create(builder, loc, allocFn,
+                                            ValueRange{numBytes, nullArg});
+      Value rawPtr = allocCall.getResult(0);
+      // Wrap the pointer in a descriptor, then cast to the original type.
+      Value descriptor = buildMemrefDescriptor(builder, loc, bufTy, rawPtr);
+      auto toMemref = UnrealizedConversionCastOp::create(
+          builder, loc, TypeRange{bufTy}, ValueRange{descriptor});
+      Value replacement = toMemref.getResult(0);
+      rawPtrOf[replacement] = rawPtr;
+      allocOp.getResult().replaceAllUsesWith(replacement);
+      allocOp.erase();
+    }
+
+    // Only deallocs of a replacement memref are rewritten; others are kept.
+    SmallVector<memref::DeallocOp> allDeallocs;
+    moduleOp.walk([&](memref::DeallocOp op) { allDeallocs.push_back(op); });
+    for (memref::DeallocOp deallocOp : allDeallocs) {
+      Value rawPtr = rawPtrOf.lookup(deallocOp.getMemref());
+      if (!rawPtr)
+        continue;
+      builder.setInsertionPoint(deallocOp);
+      Location loc = deallocOp.getLoc();
+      Value nullArg = LLVM::ZeroOp::create(builder, loc, ptrTy);
+      func::CallOp::create(builder, loc, freeFn, ValueRange{rawPtr, nullArg});
+      deallocOp.erase();
+    }
+  }
+};
+
+} // namespace
+
+namespace xilinx::air {
+
+/// Factory entry point: builds a new AIRSymmetricAllocToMgpuPass and returns
+/// it to the caller as an owning pointer to the generic mlir::Pass interface.
+std::unique_ptr<mlir::Pass> createAIRSymmetricAllocToMgpuPass() {
+  return std::make_unique<AIRSymmetricAllocToMgpuPass>();
+}
+
+} // namespace xilinx::air
diff --git a/mlir/lib/Conversion/AIRTranslateToLLVMPass.cpp b/mlir/lib/Conversion/AIRTranslateToLLVMPass.cpp
new file mode 100644
index 000000000..eeae715b6
--- /dev/null
+++ b/mlir/lib/Conversion/AIRTranslateToLLVMPass.cpp
@@ -0,0 +1,175 @@
+//===- AIRTranslateToLLVMPass.cpp -------------------------------*- C++ -*-===//
+//
+// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+//===-----------------------------------------------------------------------===//
+//
+// Lower air.translate to memref-descriptor construction over a peer-rebased
+// pointer.
+//
+// For each `air.translate %src, %from, %to, %bases`:
+//   1. Extract the source memref's aligned pointer as `index`.
+//   2. Read per-rank base addresses from the heap_bases memref:
+//          from_base = bases[from]
+//          to_base   = bases[to]
+//      via memref.load (each element is an `index` — a pointer-width
+//      integer).
+//   3. Compute the peer aligned index:
+//          peer_aligned = src_aligned + (to_base - from_base)
+//   4. Materialize the peer aligned address as !llvm.ptr (needed only for
+//      the descriptor build below — memref descriptors are LLVM structs).
+//   5. Build a fresh LLVM memref descriptor (poison + insertvalue chain)
+//      whose allocated/aligned pointers both reference the peer address;
+//      offset = 0, sizes/strides come from the source memref's static type.
+//   6. unrealized_conversion_cast the descriptor back to the result memref
+//      type so downstream uses keep working through the standard
+//      memref-to-llvm pipeline.
+//
+// Steps 1-3 use only memref + arith + index ops. The LLVM dialect appears
+// only in steps 4-5 where it is unavoidable (memref descriptors *are* LLVM
+// structs). The lowering is therefore valid both at host scope and inside
+// `gpu.func` — the kernel just needs the heap_bases memref as an argument.
+//
+//===-----------------------------------------------------------------------===//
+
+#include "air/Conversion/AIRTranslateToLLVMPass.h"
+#include "air/Conversion/GPUPassDetail.h"
+#include "air/Dialect/AIR/AIRDialect.h"
+
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/Pass/Pass.h"
+
+using namespace mlir;
+using namespace xilinx;
+
+namespace {
+
+// Construct the LLVM memref-descriptor struct for `memrefTy` around `ptr`:
+// `ptr` is written to both pointer fields, the offset field is 0, and the
+// sizes/strides are the static extents with row-major strides.
+// Mirrors buildMemrefDescriptor in AIRSymmetricAllocToMgpuPass.
+static Value buildPeerDescriptor(OpBuilder &b, Location loc,
+                                 MemRefType memrefTy, Value ptr) {
+  auto *ctx = b.getContext();
+  auto i64Ty = b.getI64Type();
+  auto ptrTy = LLVM::LLVMPointerType::get(ctx);
+  ArrayRef<int64_t> extents = memrefTy.getShape();
+  const unsigned rank = extents.size();
+
+  // Fields: (ptr, ptr, i64) plus two `rank`-long i64 arrays when rank > 0.
+  SmallVector<Type, 5> fieldTys = {ptrTy, ptrTy, i64Ty};
+  if (rank != 0) {
+    auto perDimTy = LLVM::LLVMArrayType::get(i64Ty, rank);
+    fieldTys.push_back(perDimTy);
+    fieldTys.push_back(perDimTy);
+  }
+  auto descTy = LLVM::LLVMStructType::getLiteral(ctx, fieldTys);
+
+  Value desc = LLVM::PoisonOp::create(b, loc, descTy);
+  auto setField = [&](Value field, ArrayRef<int64_t> position) {
+    desc = LLVM::InsertValueOp::create(b, loc, desc, field, position);
+  };
+  auto constI64 = [&](int64_t value) -> Value {
+    return LLVM::ConstantOp::create(b, loc, i64Ty, b.getI64IntegerAttr(value));
+  };
+  setField(ptr, ArrayRef<int64_t>{0});
+  setField(ptr, ArrayRef<int64_t>{1});
+  setField(constI64(0), ArrayRef<int64_t>{2});
+  if (rank == 0)
+    return desc;
+
+  // Row-major strides, filled from the innermost dimension outwards.
+  SmallVector<int64_t> strides(rank, 1);
+  for (unsigned i = rank - 1; i > 0; --i)
+    strides[i - 1] = strides[i] * extents[i];
+  for (unsigned i = 0; i < rank; ++i) {
+    const auto dim = static_cast<int64_t>(i);
+    setField(constI64(extents[i]), ArrayRef<int64_t>{3, dim});
+    setField(constI64(strides[i]), ArrayRef<int64_t>{4, dim});
+  }
+  return desc;
+}
+
+struct AIRTranslateToLLVMPass
+    : public xilinx::air::impl::AIRTranslateToLLVMBase<AIRTranslateToLLVMPass> {
+
+  AIRTranslateToLLVMPass() = default;
+  AIRTranslateToLLVMPass(const AIRTranslateToLLVMPass &) {}
+
+  // Rewrite every air.translate in the module. Signals pass failure on a
+  // source memref with a dynamic shape or a non-identity layout.
+  void runOnOperation() override {
+    auto module = getOperation();
+    auto *ctx = module.getContext();
+    OpBuilder builder(ctx);
+    auto i64Ty = builder.getI64Type();
+    auto ptrTy = LLVM::LLVMPointerType::get(ctx);
+
+    SmallVector<air::TranslateOp> translates;
+    module.walk([&](air::TranslateOp op) { translates.push_back(op); });
+    if (translates.empty())
+      return;
+
+    for (air::TranslateOp op : translates) {
+      builder.setInsertionPoint(op);
+      Location loc = op.getLoc();
+
+      auto memrefTy = cast<MemRefType>(op.getSource().getType());
+      if (!memrefTy.hasStaticShape()) {
+        op.emitOpError("air.translate requires a static-shape source memref");
+        signalPassFailure();
+        return;
+      }
+      // The rebuilt descriptor hard-codes offset 0 and row-major strides, so
+      // it only matches the memref type when the layout is the identity.
+      if (!memrefTy.getLayout().isIdentity()) {
+        op.emitOpError("air.translate requires an identity-layout memref");
+        signalPassFailure();
+        return;
+      }
+
+      // Source aligned pointer, as an index (pointer-width integer) value.
+      Value srcIdx = memref::ExtractAlignedPointerAsIndexOp::create(
+          builder, loc, op.getSource());
+
+      // bases[from] and bases[to], loaded from the heap_bases memref.
+      Value fromBase = memref::LoadOp::create(builder, loc, op.getHeapBases(),
+                                              ValueRange{op.getFromRank()});
+      Value toBase = memref::LoadOp::create(builder, loc, op.getHeapBases(),
+                                            ValueRange{op.getToRank()});
+
+      // peer = src + (bases[to] - bases[from]), still as index.
+      Value delta = arith::SubIOp::create(builder, loc, toBase, fromBase);
+      Value peerIdx = arith::AddIOp::create(builder, loc, srcIdx, delta);
+
+      // index -> i64 -> !llvm.ptr, the type the descriptor fields need.
+      Value peerI64 = arith::IndexCastOp::create(builder, loc, i64Ty, peerIdx);
+      Value peerPtr = LLVM::IntToPtrOp::create(builder, loc, ptrTy, peerI64);
+
+      // Fresh descriptor over the peer pointer, cast back to the memref type.
+      Value desc = buildPeerDescriptor(builder, loc, memrefTy, peerPtr);
+      Value newMemref = UnrealizedConversionCastOp::create(
+                            builder, loc, TypeRange{memrefTy}, ValueRange{desc})
+                            .getResult(0);
+      op.getResult().replaceAllUsesWith(newMemref);
+      op.erase();
+    }
+  }
+};
+
+} // namespace
+
+namespace xilinx::air {
+
+/// Factory entry point: builds a new AIRTranslateToLLVMPass and returns it
+/// to the caller as an owning pointer to the generic mlir::Pass interface.
+std::unique_ptr<mlir::Pass> createAIRTranslateToLLVMPass() {
+  return std::make_unique<AIRTranslateToLLVMPass>();
+}
+
+} // namespace xilinx::air
diff --git a/mlir/lib/Conversion/CMakeLists.txt b/mlir/lib/Conversion/CMakeLists.txt
index 46c0101b1..124a2dc6b 100644
--- a/mlir/lib/Conversion/CMakeLists.txt
+++ b/mlir/lib/Conversion/CMakeLists.txt
@@ -55,7 +55,12 @@ if(AIR_ENABLE_GPU)
   set(GPU_PASS_DEPENDS AIRGPUConversionPassIncGen)
   list(APPEND CONVERSION_SOURCES
     AIRToROCDLPass.cpp
+    AIRTranslateToLLVMPass.cpp
     GPUKernelOutlinePass.cpp
+    AIRRankToMgpuPass.cpp
+    AIRSymmetricAllocToMgpuPass.cpp
+    AIRCrossRankDmaToMgpuPass.cpp
+    AIRGpuChannelToMgpuPass.cpp
   )
   list(APPEND CONVERSION_LINK_LIBS
     MLIRGPUDialect
diff --git a/mlir/lib/Conversion/Passes.cpp b/mlir/lib/Conversion/Passes.cpp
index b8342da3e..4fb3057f2 100644
--- a/mlir/lib/Conversion/Passes.cpp
+++ b/mlir/lib/Conversion/Passes.cpp
@@ -9,7 +9,12 @@
 #include "air/Conversion/Passes.h"
 
 #if AIR_ENABLE_GPU
+#include "air/Conversion/AIRCrossRankDmaToMgpuPass.h"
+#include "air/Conversion/AIRGpuChannelToMgpuPass.h"
+#include "air/Conversion/AIRRankToMgpuPass.h"
+#include "air/Conversion/AIRSymmetricAllocToMgpuPass.h"
 #include "air/Conversion/AIRToROCDLPass.h"
+#include "air/Conversion/AIRTranslateToLLVMPass.h"
 #include "air/Conversion/GPUKernelOutlinePass.h"
 #endif
 
diff --git a/mlir/lib/Dialect/AIR/IR/AIRDialect.cpp b/mlir/lib/Dialect/AIR/IR/AIRDialect.cpp
index 720d09a7f..eca0c7dd8 100644
--- a/mlir/lib/Dialect/AIR/IR/AIRDialect.cpp
+++ b/mlir/lib/Dialect/AIR/IR/AIRDialect.cpp
@@ -3598,6 +3598,20 @@ ParseResult air::CustomOp::parse(OpAsmParser &parser, OperationState &result) {
   return success();
 }
 
+//
+// TranslateOp
+//
+
+OpFoldResult air::TranslateOp::fold(FoldAdaptor adaptor) {
+  // Fold to the source when both ranks are the same SSA value or constant.
+  auto lhs = dyn_cast_if_present<IntegerAttr>(adaptor.getFromRank());
+  auto rhs = dyn_cast_if_present<IntegerAttr>(adaptor.getToRank());
+  bool sameConst = lhs && rhs && lhs.getValue() == rhs.getValue();
+  if (sameConst || getFromRank() == getToRank())
+    return getSource();
+  return {};
+}
+
 } // namespace xilinx
 
 #include "air/Dialect/AIR/AIROpInterfaces.cpp.inc"
diff --git a/mlir/test/Conversion/AIRCrossRankDmaToMgpu/cross_rank_dma.mlir b/mlir/test/Conversion/AIRCrossRankDmaToMgpu/cross_rank_dma.mlir
new file mode 100644
index 000000000..335c2ac5a
--- /dev/null
+++ b/mlir/test/Conversion/AIRCrossRankDmaToMgpu/cross_rank_dma.mlir
@@ -0,0 +1,136 @@
+//===- cross_rank_dma.mlir --------------------------------------*- MLIR -*-===//
+//
+// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+//===-----------------------------------------------------------------------===//
+
+// RUN: air-opt %s --split-input-file -air-cross-rank-dma-to-mgpu | FileCheck %s
+
+// Each test wraps the cross-rank dma in air.rank to satisfy the verifier
+// (added in Phase 1) that requires an enclosing air.rank scope.
+
+// Basic src_rank: lower to mgpuMemcpy with peer-VA addressing.
+// CHECK-LABEL: func.func @src_rank
+// CHECK: arith.constant 4096 : i64
+// CHECK: llvm.mlir.zero : !llvm.ptr
+// Extract pointers from both memrefs.
+// CHECK: memref.extract_aligned_pointer_as_index
+// CHECK: memref.extract_aligned_pointer_as_index
+// Get bases and rank.
+// CHECK: call @mgpuGetHeapBases() : () -> !llvm.ptr
+// CHECK: call @mgpuGetRank() : () -> i32
+// CHECK: arith.extsi
+// CHECK: llvm.getelementptr
+// CHECK: llvm.load
+// peer rank constant (0).
+// CHECK: llvm.mlir.constant(0 : i64)
+// CHECK: llvm.getelementptr
+// CHECK: llvm.load
+// offset = peer_local_int - my_base_int.
+// CHECK: llvm.ptrtoint
+// CHECK: llvm.ptrtoint
+// CHECK: arith.subi
+// peer_ptr = peer_base + offset (byte stride GEP).
+// CHECK: llvm.getelementptr {{.*}} -> !llvm.ptr, i8
+// Final memcpy call.
+// CHECK: call @mgpuMemcpy
+// CHECK-NOT: air.dma_memcpy_nd
+func.func @src_rank(%dst: memref<1024xf32>, %src: memref<1024xf32>) {
+  %c2 = arith.constant 2 : index
+  air.rank (%rid) in (%rsize = %c2) args(%d = %dst, %s = %src)
+      : memref<1024xf32>, memref<1024xf32> {
+    air.dma_memcpy_nd (%d[] [] [], %s[] [] []) {src_rank = 0 : i64}
+        : (memref<1024xf32>, memref<1024xf32>)
+    air.rank_terminator
+  }
+  return
+}
+
+// -----
+
+// dst_rank: same lowering pattern, peer pointer becomes the dst arg.
+// CHECK-LABEL: func.func @dst_rank
+// CHECK: call @mgpuMemcpy
+// CHECK-NOT: air.dma_memcpy_nd
+func.func @dst_rank(%dst: memref<1024xf32>, %src: memref<1024xf32>) {
+  %c2 = arith.constant 2 : index
+  air.rank (%rid) in (%rsize = %c2) args(%d = %dst, %s = %src)
+      : memref<1024xf32>, memref<1024xf32> {
+    air.dma_memcpy_nd (%d[] [] [], %s[] [] []) {dst_rank = 1 : i64}
+        : (memref<1024xf32>, memref<1024xf32>)
+    air.rank_terminator
+  }
+  return
+}
+
+// -----
+
+// 2D memref byte size: 64 * 64 * 4 = 16384.
+// CHECK-LABEL: func.func @cross_rank_2d
+// CHECK: arith.constant 16384 : i64
+// CHECK: call @mgpuMemcpy
+func.func @cross_rank_2d(%dst: memref<64x64xf32>, %src: memref<64x64xf32>) {
+  %c2 = arith.constant 2 : index
+  air.rank (%rid) in (%rsize = %c2) args(%d = %dst, %s = %src)
+      : memref<64x64xf32>, memref<64x64xf32> {
+    air.dma_memcpy_nd (%d[] [] [], %s[] [] []) {src_rank = 0 : i64}
+        : (memref<64x64xf32>, memref<64x64xf32>)
+    air.rank_terminator
+  }
+  return
+}
+
+// -----
+
+// f64 element type: 256 * 8 = 2048 bytes.
+// CHECK-LABEL: func.func @cross_rank_f64
+// CHECK: arith.constant 2048 : i64
+func.func @cross_rank_f64(%dst: memref<256xf64>, %src: memref<256xf64>) {
+  %c2 = arith.constant 2 : index
+  air.rank (%rid) in (%rsize = %c2) args(%d = %dst, %s = %src)
+      : memref<256xf64>, memref<256xf64> {
+    air.dma_memcpy_nd (%d[] [] [], %s[] [] []) {src_rank = 0 : i64}
+        : (memref<256xf64>, memref<256xf64>)
+    air.rank_terminator
+  }
+  return
+}
+
+// -----
+
+// Multiple cross-rank DMAs in one function: extern decls emitted exactly once.
+// Match emission order from ensureExternFunc (insertion-at-top -> reverse).
+// CHECK-COUNT-1: func.func private @mgpuMemcpy
+// CHECK-NOT: func.func private @mgpuMemcpy
+// CHECK-COUNT-1: func.func private @mgpuGetHeapBases
+// CHECK-NOT: func.func private @mgpuGetHeapBases
+// CHECK-COUNT-1: func.func private @mgpuGetRank
+// CHECK-NOT: func.func private @mgpuGetRank
+// CHECK-LABEL: func.func @two_dmas
+// CHECK-COUNT-2: call @mgpuMemcpy
+func.func @two_dmas(%dst: memref<32xf32>, %src: memref<32xf32>) {
+  %c2 = arith.constant 2 : index
+  air.rank (%rid) in (%rsize = %c2) args(%d = %dst, %s = %src)
+      : memref<32xf32>, memref<32xf32> {
+    air.dma_memcpy_nd (%d[] [] [], %s[] [] []) {src_rank = 0 : i64}
+        : (memref<32xf32>, memref<32xf32>)
+    air.dma_memcpy_nd (%d[] [] [], %s[] [] []) {src_rank = 0 : i64}
+        : (memref<32xf32>, memref<32xf32>)
+    air.rank_terminator
+  }
+  return
+}
+
+// -----
+
+// LAST partition: pass is a no-op for non-cross-rank DMAs.
+// CHECK-LABEL: func.func @no_cross_rank
+// CHECK: air.dma_memcpy_nd
+// CHECK-NOT: mgpuMemcpy
+// CHECK-NOT: mgpuGetHeapBases
+func.func @no_cross_rank(%dst: memref<1024xf32, 2>, %src: memref<1024xf32>) {
+  air.dma_memcpy_nd (%dst[] [] [], %src[] [] [])
+      : (memref<1024xf32, 2>, memref<1024xf32>)
+  return
+}
diff --git a/mlir/test/Conversion/AIRGpuChannelToMgpu/gpu_channel.mlir b/mlir/test/Conversion/AIRGpuChannelToMgpu/gpu_channel.mlir
new file mode 100644
index 000000000..64da49ab0
--- /dev/null
+++ b/mlir/test/Conversion/AIRGpuChannelToMgpu/gpu_channel.mlir
@@ -0,0 +1,87 @@
+//===- gpu_channel.mlir -----------------------------------------*- MLIR -*-===//
+//
+// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+//===-----------------------------------------------------------------------===//
+
+// RUN: air-opt %s --split-input-file -air-gpu-channel-to-mgpu | FileCheck %s
+
+// Basic put/get pair with peer-rank index. The put becomes a barrier; the
+// get becomes barrier + cross-rank mgpuMemcpy.
+// CHECK-LABEL: func.func @basic_pair
+// CHECK-NOT: air.channel @
+// CHECK: arith.constant 0 : index
+// Inside the rank body: put -> barrier
+// CHECK: call @mgpuBarrier
+// CHECK-NOT: air.channel.put
+// Then: get -> barrier + memcpy with peer-VA addressing.
+// CHECK: call @mgpuBarrier
+// CHECK: arith.constant 4096 : i64
+// CHECK: memref.extract_aligned_pointer_as_index
+// CHECK: memref.extract_aligned_pointer_as_index
+// CHECK: call @mgpuGetHeapBases
+// CHECK: call @mgpuGetRank
+// CHECK: llvm.getelementptr
+// CHECK: llvm.load
+// peer rank = constant 0 (peer index from get).
+// CHECK: arith.index_cast
+// CHECK: llvm.getelementptr
+// CHECK: llvm.load
+// offset = src_int - my_base_int.
+// CHECK: llvm.ptrtoint
+// CHECK: llvm.ptrtoint
+// CHECK: arith.subi
+// peer_src = peer_base + offset (byte stride).
+// CHECK: llvm.getelementptr {{.*}} -> !llvm.ptr, i8
+// CHECK: call @mgpuMemcpy
+// CHECK-NOT: air.channel.get
+air.channel @sym_chan [] {channel_type = "gpu_symmetric_heap"}
+func.func @basic_pair(%src: memref<1024xf32>, %dst: memref<1024xf32>) {
+  %c2 = arith.constant 2 : index
+  air.rank (%rid) in (%rsize = %c2) args(%s = %src, %d = %dst)
+      : memref<1024xf32>, memref<1024xf32> {
+    %c0 = arith.constant 0 : index
+    %sym = memref.alloc() {air.symmetric} : memref<1024xf32>
+    air.channel.put @sym_chan[] (%sym[] [] []) : (memref<1024xf32>)
+    air.channel.get @sym_chan[%c0] (%d[] [] []) : (memref<1024xf32>)
+    memref.dealloc %sym : memref<1024xf32>
+    air.rank_terminator
+  }
+  return
+}
+
+// -----
+
+// Channel decl is erased after lowering (the channel symbol no longer
+// exists in the lowered IR).
+// CHECK-LABEL: func.func @decl_erased
+// CHECK-NOT: air.channel @sym_chan2
+air.channel @sym_chan2 [] {channel_type = "gpu_symmetric_heap"}
+func.func @decl_erased(%dst: memref<32xf32>) {
+  %c2 = arith.constant 2 : index
+  air.rank (%rid) in (%rsize = %c2) args(%d = %dst)
+      : memref<32xf32> {
+    %c0 = arith.constant 0 : index
+    %sym = memref.alloc() {air.symmetric} : memref<32xf32>
+    air.channel.put @sym_chan2[] (%sym[] [] []) : (memref<32xf32>)
+    air.channel.get @sym_chan2[%c0] (%d[] [] []) : (memref<32xf32>)
+    memref.dealloc %sym : memref<32xf32>
+    air.rank_terminator
+  }
+  return
+}
+
+// -----
+
+// LAST partition: pass is a no-op for non-gpu_symmetric_heap channels.
+// (npu_dma_stream channels must be left alone for the AIE backend.)
+// CHECK-LABEL: func.func @no_gpu_channel
+// CHECK: air.channel.put @npu_chan
+// CHECK-NOT: mgpuMemcpy
+// CHECK-NOT: mgpuGetHeapBases
+air.channel @npu_chan [] {channel_type = "npu_dma_stream"}
+func.func @no_gpu_channel(%src: memref<32xf32>) {
+  air.channel.put @npu_chan[] (%src[] [] []) : (memref<32xf32>)
+  return
+}
diff --git a/mlir/test/Conversion/AIRRankToMgpu/rank_to_mgpu.mlir b/mlir/test/Conversion/AIRRankToMgpu/rank_to_mgpu.mlir
new file mode 100644
index 000000000..067547ee4
--- /dev/null
+++ b/mlir/test/Conversion/AIRRankToMgpu/rank_to_mgpu.mlir
@@ -0,0 +1,189 @@
+//===- rank_to_mgpu.mlir ----------------------------------------*- MLIR -*-===//
+//
+// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+//===-----------------------------------------------------------------------===//
+
+// RUN: air-opt %s --split-input-file -air-rank-to-mgpu                       | FileCheck %s
+// RUN: air-opt %s --split-input-file -air-rank-to-mgpu='heap-size=536870912' | FileCheck %s --check-prefix=HEAPOPT
+
+// CHECK-LABEL: func.func @test_rank_1d
+// CHECK: call @mgpuSymmetricHeapInit
+// CHECK-NOT: air.rank
+// CHECK: %[[R:.*]] = call @mgpuGetRank() : () -> i32
+// CHECK: arith.extsi %[[R]] : i32 to i64
+// CHECK: arith.index_cast
+// CHECK: call @mgpuSymmetricHeapDestroy
+// CHECK: return
+
+// HEAPOPT-LABEL: func.func @test_rank_1d
+// HEAPOPT: arith.constant 536870912 : i64
+// HEAPOPT: call @mgpuSymmetricHeapInit
+func.func @test_rank_1d(%arg0: memref<16x16xf32>) {
+  %num_ranks = arith.constant 2 : index
+  air.rank (%rank_x) in (%size_x = %num_ranks) args(%rank_buf=%arg0) : memref<16x16xf32> {
+    %one = arith.constant 1 : index
+    air.launch (%launch_x) in (%launch_size = %one) args(%launch_buf=%rank_buf) : memref<16x16xf32> {
+      air.launch_terminator
+    }
+  }
+  return
+}
+
+// -----
+
+// CHECK-LABEL: func.func @test_rank_2d
+// The flat rank is split as id_x = flat % size_x and id_y = flat / size_x.
+// CHECK: %[[FLAT:.*]] = arith.index_cast
+// CHECK: %[[IDX:.*]] = arith.remsi %[[FLAT]], %{{.*}}
+// CHECK: %[[IDY:.*]] = arith.divsi %[[FLAT]], %{{.*}}
+// CHECK-NOT: air.rank
+func.func @test_rank_2d(%arg0: memref<16x16xf32>) {
+  %two = arith.constant 2 : index
+  %four = arith.constant 4 : index
+  air.rank (%id_x, %id_y) in (%size_x = %two, %size_y = %four) args(%rank_buf=%arg0) : memref<16x16xf32> {
+    %one = arith.constant 1 : index
+    air.launch (%launch_x) in (%launch_size = %one) args(%launch_buf=%rank_buf) : memref<16x16xf32> {
+      air.launch_terminator
+    }
+  }
+  return
+}
+
+// -----
+
+// Without a heap-size option the init call takes 268435456 bytes (256 MiB).
+// CHECK-LABEL: func.func @test_rank_default_heap
+// CHECK: arith.constant 268435456 : i64
+// CHECK: call @mgpuSymmetricHeapInit
+func.func @test_rank_default_heap() {
+  %num_ranks = arith.constant 2 : index
+  air.rank (%rank_x) in (%size_x = %num_ranks) {
+  }
+  return
+}
+
+// -----
+
+// Async form: the rank op yields a token. The pass is expected to emit an
+// air.wait_all standing in for that token while still inlining the body.
+// CHECK-LABEL: func.func @test_rank_async
+// CHECK: call @mgpuSymmetricHeapInit
+// CHECK: call @mgpuGetRank
+// CHECK-NOT: air.rank
+// CHECK: air.wait_all
+// CHECK: call @mgpuSymmetricHeapDestroy
+func.func @test_rank_async() -> !air.async.token {
+  %num_ranks = arith.constant 2 : index
+  %token = air.rank async (%rank_x) in (%size_x = %num_ranks) {
+  }
+  return %token : !air.async.token
+}
+
+// -----
+
+// Async dependency: air.rank async [%dep]. The pass is expected to block on
+// the dependency with a wait_all ahead of the lowered rank body.
+// CHECK-LABEL: func.func @test_rank_async_dep
+// CHECK: %[[DEP:.*]] = air.wait_all async
+// CHECK: air.wait_all [%[[DEP]]]
+// CHECK: call @mgpuGetRank
+// CHECK-NOT: air.rank
+func.func @test_rank_async_dep() {
+  %num_ranks = arith.constant 2 : index
+  %dep = air.wait_all async
+  %token = air.rank async [%dep] (%rank_x) in (%size_x = %num_ranks) {
+  }
+  return
+}
+
+// -----
+
+// Two rank ops in one function: heap init/destroy exactly once, one mgpuGetRank each.
+// CHECK-LABEL: func.func @test_multiple_ranks
+// CHECK-COUNT-1: call @mgpuSymmetricHeapInit
+// CHECK-NOT: call @mgpuSymmetricHeapInit
+// CHECK-COUNT-2: call @mgpuGetRank
+// CHECK-COUNT-1: call @mgpuSymmetricHeapDestroy
+// CHECK-NOT: call @mgpuSymmetricHeapDestroy
+// CHECK-NOT: air.rank
+func.func @test_multiple_ranks() {
+  %c2 = arith.constant 2 : index
+  air.rank (%rx) in (%sx = %c2) {
+  }
+  air.rank (%rx) in (%sx = %c2) {
+  }
+  return
+}
+
+// -----
+
+// Two return paths: a destroy call is expected in front of EACH return.
+// CHECK-LABEL: func.func @test_multiple_returns
+// CHECK-COUNT-1: call @mgpuSymmetricHeapInit
+// CHECK-COUNT-2: call @mgpuSymmetricHeapDestroy
+func.func @test_multiple_returns(%cond: i1) {
+  %num_ranks = arith.constant 2 : index
+  air.rank (%rank_x) in (%size_x = %num_ranks) {
+  }
+  cf.cond_br %cond, ^exit_a, ^exit_b
+^exit_a:
+  return
+^exit_b:
+  return
+}
+
+// -----
+
+// Operand mapping: the rank body sees %arg0 through the block argument bound
+// in args(...). After inlining, uses of that block argument should refer to
+// the original SSA value.
+// CHECK-LABEL: func.func @test_kernel_args(
+// CHECK-SAME: %[[ARG0:.*]]: memref<16x16xf32>
+// CHECK-NOT: air.rank
+// The store should use the function argument itself, not a block argument.
+// CHECK: memref.store %{{.*}}, %[[ARG0]]
+func.func @test_kernel_args(%arg0: memref<16x16xf32>) {
+  %num_ranks = arith.constant 2 : index
+  air.rank (%rank_x) in (%size_x = %num_ranks) args(%rank_buf=%arg0) : memref<16x16xf32> {
+    %zero_f32 = arith.constant 0.0 : f32
+    %idx0 = arith.constant 0 : index
+    memref.store %zero_f32, %rank_buf[%idx0, %idx0] : memref<16x16xf32>
+  }
+  return
+}
+
+// -----
+
+// Extern decls are idempotent: each mgpu* function is declared only once in
+// the module, even with several ranks spread over several functions.
+// CHECK-COUNT-1: func.func private @mgpuGetRank
+// CHECK-NOT: func.func private @mgpuGetRank
+// CHECK-COUNT-1: func.func private @mgpuSymmetricHeapDestroy
+// CHECK-NOT: func.func private @mgpuSymmetricHeapDestroy
+// CHECK-COUNT-1: func.func private @mgpuSymmetricHeapInit
+// CHECK-NOT: func.func private @mgpuSymmetricHeapInit
+func.func @test_decls_in_func_a() {
+  %num_ranks = arith.constant 2 : index
+  air.rank (%rank_x) in (%size_x = %num_ranks) {}
+  return
+}
+func.func @test_decls_in_func_b() {
+  %num_ranks = arith.constant 2 : index
+  air.rank (%rank_x) in (%size_x = %num_ranks) {}
+  return
+}
+
+// -----
+
+// Negative case: a function without any air.rank must come out unchanged.
+// It is deliberately the final partition: the CHECK-NOTs below run to the
+// end of the output, where no later partition can contribute mgpu* decls.
+// CHECK-LABEL: func.func @test_no_rank
+// CHECK-NOT: mgpuSymmetricHeapInit
+// CHECK-NOT: mgpuSymmetricHeapDestroy
+// CHECK-NOT: mgpuGetRank
+func.func @test_no_rank(%arg0: memref<16x16xf32>) -> memref<16x16xf32> {
+  return %arg0 : memref<16x16xf32>
+}
diff --git a/mlir/test/Conversion/AIRSymmetricAllocToMgpu/symmetric_alloc.mlir b/mlir/test/Conversion/AIRSymmetricAllocToMgpu/symmetric_alloc.mlir
new file mode 100644
index 000000000..b0e30f0c7
--- /dev/null
+++ b/mlir/test/Conversion/AIRSymmetricAllocToMgpu/symmetric_alloc.mlir
@@ -0,0 +1,106 @@
+//===- symmetric_alloc.mlir -------------------------------------*- MLIR -*-===//
+//
+// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+//===-----------------------------------------------------------------------===//
+
+// RUN: air-opt %s --split-input-file -air-symmetric-alloc-to-mgpu | FileCheck %s
+
+// Simplest case: one 1D symmetric alloc paired with its dealloc.
+// CHECK-LABEL: func.func @basic_alloc_dealloc
+// CHECK: %[[SZ:.*]] = arith.constant 4096 : i64
+// CHECK: %[[NULL:.*]] = llvm.mlir.zero : !llvm.ptr
+// CHECK: %[[PTR:.*]] = call @mgpuSymmetricAlloc(%[[SZ]], %[[NULL]]) : (i64, !llvm.ptr) -> !llvm.ptr
+// The descriptor is built from poison via insertvalue, then cast to a memref.
+// CHECK: llvm.mlir.poison
+// CHECK: llvm.insertvalue %[[PTR]]
+// CHECK: llvm.insertvalue %[[PTR]]
+// CHECK: builtin.unrealized_conversion_cast {{.*}} : !llvm.struct<{{.*}}> to memref<1024xf32>
+// The dealloc becomes a call to mgpuSymmetricFree on the same pointer.
+// CHECK: call @mgpuSymmetricFree(%[[PTR]],
+// CHECK-NOT: memref.alloc
+// CHECK-NOT: memref.dealloc
+func.func @basic_alloc_dealloc() {
+  %sym_buf = memref.alloc() {air.symmetric} : memref<1024xf32>
+  memref.dealloc %sym_buf : memref<1024xf32>
+  return
+}
+
+// -----
+
+// 2D case: 64 * 64 * 4 = 16384 bytes, with descriptor strides [64, 1].
+// CHECK-LABEL: func.func @alloc_2d
+// CHECK: arith.constant 16384 : i64
+// CHECK: call @mgpuSymmetricAlloc
+// Constants 64 then 1 are expected in stride order (innermost dim contiguous).
+// CHECK: llvm.mlir.constant(64 : i64)
+// CHECK: llvm.insertvalue
+// CHECK: llvm.mlir.constant(1 : i64)
+// CHECK: llvm.insertvalue
+// CHECK: builtin.unrealized_conversion_cast {{.*}} : !llvm.struct<{{.*}}> to memref<64x64xf32>
+func.func @alloc_2d() -> memref<64x64xf32> {
+  %sym_buf = memref.alloc() {air.symmetric} : memref<64x64xf32>
+  return %sym_buf : memref<64x64xf32>
+}
+
+// -----
+
+// 8-byte f64 elements: the byte size is 1024 * 8 = 8192.
+// CHECK-LABEL: func.func @f64_element
+// CHECK: arith.constant 8192 : i64
+func.func @f64_element() {
+  %sym_buf = memref.alloc() {air.symmetric} : memref<1024xf64>
+  memref.dealloc %sym_buf : memref<1024xf64>
+  return
+}
+
+// -----
+
+// 4-byte i32 elements: the byte size is 256 * 4 = 1024.
+// CHECK-LABEL: func.func @i32_element
+// CHECK: arith.constant 1024 : i64
+func.func @i32_element() {
+  %sym_buf = memref.alloc() {air.symmetric} : memref<256xi32>
+  memref.dealloc %sym_buf : memref<256xi32>
+  return
+}
+
+// -----
+
+// Two symmetric allocs in one function are lowered independently, while the
+// extern decls appear only once at module scope. The decl checks follow the
+// emission order: the Free decl comes before the Alloc decl.
+// CHECK-COUNT-1: func.func private @mgpuSymmetricFree
+// CHECK-NOT: func.func private @mgpuSymmetricFree
+// CHECK-COUNT-1: func.func private @mgpuSymmetricAlloc
+// CHECK-NOT: func.func private @mgpuSymmetricAlloc
+// CHECK-LABEL: func.func @two_allocs
+// CHECK-COUNT-2: call @mgpuSymmetricAlloc
+// CHECK-COUNT-2: call @mgpuSymmetricFree
+func.func @two_allocs() {
+  %first = memref.alloc() {air.symmetric} : memref<32xf32>
+  %second = memref.alloc() {air.symmetric} : memref<64xf32>
+  memref.dealloc %first : memref<32xf32>
+  memref.dealloc %second : memref<64xf32>
+  return
+}
+
+// -----
+
+// LAST partition: negative cases where the pass must not change anything.
+// The former `ignores_non_symmetric` and `no_symmetric_alloc` cases are
+// merged into this one function so that the trailing CHECK-NOTs only scan
+// the text of this final partition.
+// CHECK-LABEL: func.func @no_symmetric_changes
+// CHECK: memref.alloc() : memref<1024xf32>
+// CHECK: memref.alloc() : memref<32xf32>
+// CHECK-NOT: mgpuSymmetricAlloc
+// CHECK-NOT: mgpuSymmetricFree
+func.func @no_symmetric_changes() {
+  %plain_a = memref.alloc() : memref<1024xf32>
+  memref.dealloc %plain_a : memref<1024xf32>
+  %plain_b = memref.alloc() : memref<32xf32>
+  memref.dealloc %plain_b : memref<32xf32>
+  return
+}
diff --git a/mlir/test/Conversion/AIRToROCDL/air_translate_to_llvm.mlir b/mlir/test/Conversion/AIRToROCDL/air_translate_to_llvm.mlir
new file mode 100644
index 000000000..84f96db8c
--- /dev/null
+++ b/mlir/test/Conversion/AIRToROCDL/air_translate_to_llvm.mlir
@@ -0,0 +1,86 @@
+//===- air_translate_to_llvm.mlir - air-translate-to-llvm pass -----------===//
+//
+// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+//===-----------------------------------------------------------------------===//
+
+// REQUIRES: gpu
+// RUN: air-opt --air-translate-to-llvm --split-input-file %s | FileCheck %s
+
+// 1D static memref: pins the complete peer-address expansion, op by op.
+// CHECK-LABEL: func.func @translate_1d
+// CHECK-DAG:   %[[SRC_IDX:.+]]   = memref.extract_aligned_pointer_as_index %arg0
+// CHECK-DAG:   %[[FROM_BASE:.+]] = memref.load %arg3[%arg1] : memref<?xindex>
+// CHECK-DAG:   %[[TO_BASE:.+]]   = memref.load %arg3[%arg2] : memref<?xindex>
+// CHECK:       %[[DIFF:.+]]      = arith.subi %[[TO_BASE]], %[[FROM_BASE]]
+// CHECK:       %[[PEER_IDX:.+]]  = arith.addi %[[SRC_IDX]], %[[DIFF]]
+// CHECK:       %[[PEER_I64:.+]]  = arith.index_cast %[[PEER_IDX]] : index to i64
+// CHECK:       %[[PEER_PTR:.+]]  = llvm.inttoptr %[[PEER_I64]] : i64 to !llvm.ptr
+// CHECK:       %[[POISON:.+]]    = llvm.mlir.poison : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
+// CHECK:       %[[D0:.+]] = llvm.insertvalue %[[PEER_PTR]], %[[POISON]][0]
+// CHECK:       %[[D1:.+]] = llvm.insertvalue %[[PEER_PTR]], %[[D0]][1]
+// CHECK:       %{{.*}}    = llvm.mlir.constant(0 : i64)
+// CHECK:       %[[D2:.+]] = llvm.insertvalue %{{.*}}, %[[D1]][2]
+// CHECK:       %{{.*}}    = llvm.mlir.constant(1024 : i64)
+// CHECK:       %[[D3:.+]] = llvm.insertvalue %{{.*}}, %[[D2]][3, 0]
+// CHECK:       %{{.*}}    = llvm.mlir.constant(1 : i64)
+// CHECK:       %[[D4:.+]] = llvm.insertvalue %{{.*}}, %[[D3]][4, 0]
+// CHECK:       %[[CAST:.+]] = builtin.unrealized_conversion_cast %[[D4]] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> to memref<1024xf32>
+// CHECK:       return %[[CAST]]
+// CHECK-NOT:   air.translate
+func.func @translate_1d(%src : memref<1024xf32>, %from : index, %to : index, %bases : memref<?xindex>) -> memref<1024xf32> {
+  %remote = air.translate %src, %from, %to, %bases : memref<1024xf32>, memref<?xindex>
+  return %remote : memref<1024xf32>
+}
+
+// -----
+
+// 2D static memref: the descriptor carries the row-major strides [64, 1].
+// CHECK-LABEL: func.func @translate_2d
+// CHECK:       memref.load %arg3[%arg1] : memref<?xindex>
+// CHECK:       memref.load %arg3[%arg2] : memref<?xindex>
+// CHECK:       llvm.mlir.poison : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+// CHECK-DAG:   llvm.mlir.constant(64 : i64)
+// CHECK-DAG:   llvm.mlir.constant(1 : i64)
+// CHECK:       builtin.unrealized_conversion_cast {{.*}} to memref<64x64xf32, 1>
+// CHECK-NOT:   air.translate
+func.func @translate_2d(%src : memref<64x64xf32, 1>, %from : index, %to : index, %bases : memref<?xindex>) -> memref<64x64xf32, 1> {
+  %remote = air.translate %src, %from, %to, %bases : memref<64x64xf32, 1>, memref<?xindex>
+  return %remote : memref<64x64xf32, 1>
+}
+
+// -----
+
+// Kernel-side use inside a gpu.func: the expansion has the same shape and
+// consists only of memref + arith ops, with no runtime call.
+// CHECK-LABEL: gpu.func @kernel
+// CHECK:       memref.extract_aligned_pointer_as_index
+// CHECK:       memref.load %arg3[%arg1] : memref<?xindex>
+// CHECK:       memref.load %arg3[%arg2] : memref<?xindex>
+// CHECK:       arith.subi
+// CHECK:       arith.addi
+// CHECK:       builtin.unrealized_conversion_cast {{.*}} to memref<1024xf32, 1>
+// CHECK:       memref.store
+// CHECK-NOT:   air.translate
+gpu.module @kernels {
+  gpu.func @kernel(%data : memref<1024xf32, 1>, %from : index, %to : index, %bases : memref<?xindex>) kernel {
+    %remote = air.translate %data, %from, %to, %bases : memref<1024xf32, 1>, memref<?xindex>
+    %idx0 = arith.constant 0 : index
+    %value = arith.constant 42.0 : f32
+    memref.store %value, %remote[%idx0] : memref<1024xf32, 1>
+    gpu.return
+  }
+}
+
+// -----
+
+// Negative case: with no air.translate in the input the body stays a bare return.
+// CHECK-LABEL: func.func @noop
+// CHECK-NEXT:    return
+// CHECK-NOT:   memref.extract_aligned_pointer_as_index
+// CHECK-NOT:   llvm.mlir.poison
+func.func @noop(%a : memref<8xf32>) -> memref<8xf32> {
+  return %a : memref<8xf32>
+}
+
diff --git a/mlir/test/Conversion/AIRToROCDL/sym_atomic_syncscope.mlir b/mlir/test/Conversion/AIRToROCDL/sym_atomic_syncscope.mlir
new file mode 100644
index 000000000..4092f6fa5
--- /dev/null
+++ b/mlir/test/Conversion/AIRToROCDL/sym_atomic_syncscope.mlir
@@ -0,0 +1,40 @@
+//===- sym_atomic_syncscope.mlir - cross-XGMI atomic preservation --------===//
+//
+// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+//===-----------------------------------------------------------------------===//
+//
+// The symmetric-heap producer/consumer test relies on a contract that
+// `llvm.atomicrmw release` and `llvm.load atomic acquire` ops emitted with
+// `syncscope("")` (= LLVM IR's System scope = cross-device on AMDGPU)
+// survive the GPU compilation pipeline unchanged. Without that, the
+// producer's release-store on rank 0's GPU is not seen by the consumer's
+// acquire-load on rank 1's GPU, and the consumer hangs forever (test
+// times out — appears as "no crash, no signal, just dead").
+//
+// The empty-string syncscope is LLVM IR's canonical spelling of System
+// scope (LLVM's textual IR omits the `syncscope(...)` token entirely when
+// scope == System; MLIR's LLVM dialect round-trips it as `syncscope("")`).
+// LLVM's AMDGPU backend user guide defines System scope as cross-device:
+//   https://llvm.org/docs/AMDGPUUsage.html#memory-model
+//
+// This test asserts that after `convert-gpu-to-rocdl` the atomic ops
+// retain their ordering and the explicit `syncscope("")` qualifier.
+//
+//===-----------------------------------------------------------------------===//
+
+// REQUIRES: gpu
+// RUN: air-opt --pass-pipeline='builtin.module(rocdl-attach-target{chip=gfx942 O=3},gpu.module(convert-scf-to-cf,convert-gpu-to-rocdl{chipset=gfx942 runtime=HIP},reconcile-unrealized-casts))' %s | FileCheck %s
+
+// CHECK-LABEL: gpu.module @kernels
+// CHECK-LABEL: llvm.func @atomic_kernel
+// CHECK:       llvm.atomicrmw xchg %{{.*}}, %{{.*}} syncscope("") release : !llvm.ptr, i32
+// CHECK:       llvm.load %{{.*}} atomic syncscope("") acquire {{.*}} : !llvm.ptr -> i32
+gpu.module @kernels {
+  gpu.func @atomic_kernel(%ptr : !llvm.ptr, %v : i32) kernel {
+    %prev = llvm.atomicrmw xchg %ptr, %v syncscope("") release : !llvm.ptr, i32
+    %seen = llvm.load %ptr atomic syncscope("") acquire {alignment = 4 : i64} : !llvm.ptr -> i32
+    gpu.return
+  }
+}
diff --git a/mlir/test/Dialect/AIR/air_translate.mlir b/mlir/test/Dialect/AIR/air_translate.mlir
new file mode 100644
index 000000000..c107da0c8
--- /dev/null
+++ b/mlir/test/Dialect/AIR/air_translate.mlir
@@ -0,0 +1,63 @@
+//===- air_translate.mlir - air.translate parser, printer, folder --------===//
+//
+// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+//===-----------------------------------------------------------------------===//
+
+// RUN: air-opt %s | FileCheck %s
+// RUN: air-opt --canonicalize %s | FileCheck %s --check-prefix=FOLD
+
+// Parse/print round-trip of a 1D static memref.
+// CHECK-LABEL: func.func @translate_1d
+// CHECK: %{{.*}} = air.translate %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref<1024xf32>, memref<?xindex>
+func.func @translate_1d(%src : memref<1024xf32>, %from : index, %to : index, %bases : memref<?xindex>) -> memref<1024xf32> {
+  %remote = air.translate %src, %from, %to, %bases : memref<1024xf32>, memref<?xindex>
+  return %remote : memref<1024xf32>
+}
+
+// Parse/print round-trip of a 2D static memref in address space 1.
+// CHECK-LABEL: func.func @translate_2d_addrspace
+// CHECK: air.translate %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref<64x64xf32, 1>, memref<?xindex>
+func.func @translate_2d_addrspace(%src : memref<64x64xf32, 1>, %from : index, %to : index, %bases : memref<?xindex>) -> memref<64x64xf32, 1> {
+  %remote = air.translate %src, %from, %to, %bases : memref<64x64xf32, 1>, memref<?xindex>
+  return %remote : memref<64x64xf32, 1>
+}
+
+// Parse/print round-trip with a statically shaped heap_bases operand.
+// CHECK-LABEL: func.func @translate_static_bases
+// CHECK: air.translate %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref<8xf32>, memref<8xindex>
+func.func @translate_static_bases(%src : memref<8xf32>, %from : index, %to : index, %bases : memref<8xindex>) -> memref<8xf32> {
+  %remote = air.translate %src, %from, %to, %bases : memref<8xf32>, memref<8xindex>
+  return %remote : memref<8xf32>
+}
+
+// Folder: using one SSA value as both from_rank and to_rank folds to %src.
+// FOLD-LABEL: func.func @fold_same_rank
+// FOLD-NOT: air.translate
+// FOLD: return %arg0 : memref<8xf32>
+func.func @fold_same_rank(%src : memref<8xf32>, %r : index, %bases : memref<?xindex>) -> memref<8xf32> {
+  %remote = air.translate %src, %r, %r, %bases : memref<8xf32>, memref<?xindex>
+  return %remote : memref<8xf32>
+}
+
+// Folder: two separate constant ops holding the same value fold as well.
+// FOLD-LABEL: func.func @fold_constant_eq_ranks
+// FOLD-NOT: air.translate
+// FOLD: return %arg0 : memref<8xf32>
+func.func @fold_constant_eq_ranks(%src : memref<8xf32>, %bases : memref<?xindex>) -> memref<8xf32> {
+  %rank_a = arith.constant 2 : index
+  %rank_b = arith.constant 2 : index
+  %remote = air.translate %src, %rank_a, %rank_b, %bases : memref<8xf32>, memref<?xindex>
+  return %remote : memref<8xf32>
+}
+
+// Non-fold: constants with different values must leave the op in place.
+// FOLD-LABEL: func.func @no_fold_distinct_constants
+// FOLD: air.translate
+func.func @no_fold_distinct_constants(%src : memref<8xf32>, %bases : memref<?xindex>) -> memref<8xf32> {
+  %rank0 = arith.constant 0 : index
+  %rank1 = arith.constant 1 : index
+  %remote = air.translate %src, %rank0, %rank1, %bases : memref<8xf32>, memref<?xindex>
+  return %remote : memref<8xf32>
+}
diff --git a/test/gpu/symmetric_heap_dma/air_sym_handwritten_atomic.mlir b/test/gpu/symmetric_heap_dma/air_sym_handwritten_atomic.mlir
new file mode 100644
index 000000000..a0743e60c
--- /dev/null
+++ b/test/gpu/symmetric_heap_dma/air_sym_handwritten_atomic.mlir
@@ -0,0 +1,330 @@
+//===- air_sym_handwritten_atomic.mlir - multi-GPU e2e (atomic flag) ------===//
+//
+// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+//===------------------------------------------------------------------===//
+//
+// Symmetric-heap producer/consumer e2e (WORLD_SIZE=2), atomic-flag variant.
+// Sister file: air_sym_handwritten_cacheline.mlir uses cache-line atomicity
+// instead of LLVM atomics for the cross-rank handoff.
+//
+//   rank 0 launches @producer; rank 1 launches @consumer.
+//   producer writes 42.0 into rank 1's `data` over XGMI; per-warp flags
+//   (4 i32, in rank 1's HBM) signal completion via release atomicrmw with
+//   syncscope("") (= LLVM System scope = cross-device on AMDGPU).
+//   consumer's lane 0 acquires on its flag, then all 64 lanes copy
+//   the local data slot to verify_buf for host check.
+//   Launch: 1 block × 256 threads = 4 warps × 64 lanes.
+//
+// Synchronization contract is spec-defined: see sym_atomic_syncscope.mlir
+// for the FileCheck contract test that pins the lowering behavior.
+//
+// Launcher: run.sh forks N processes with RANK / WORLD_SIZE / LOCAL_RANK.
+//
+//===------------------------------------------------------------------===//
+
+module attributes {gpu.container_module} {
+  // ---- mgpu* C ABI declarations -----------------------------------------
+  func.func private @mgpuSymmetricHeapInit(i64)
+  func.func private @mgpuSymmetricHeapDestroy()
+  func.func private @mgpuGetRank() -> i32
+  func.func private @mgpuGetWorldSize() -> i32
+  func.func private @mgpuSymmetricAlloc(i64, !llvm.ptr) -> !llvm.ptr
+  func.func private @mgpuSymmetricFree(!llvm.ptr, !llvm.ptr)
+  func.func private @mgpuGetHeapBases() -> !llvm.ptr
+  func.func private @mgpuBarrier()
+  func.func private @mgpuMemAlloc(i64, !llvm.ptr, i1) -> !llvm.ptr
+  func.func private @mgpuMemFree(!llvm.ptr, !llvm.ptr)
+  func.func private @mgpuMemcpy(!llvm.ptr, !llvm.ptr, i64, !llvm.ptr)
+
+  // libc exit: by design the verify branch calls this on any mismatch, so
+  // run.sh sees a non-zero exit status rather than a pass without validation.
+  func.func private @exit(i32)
+
+  llvm.func @printf(!llvm.ptr, ...) -> i32
+
+  llvm.mlir.global internal constant @msg_init(
+      "[mlir] rank %d / world %d, init OK\0A\00") {addr_space = 0 : i32}
+  llvm.mlir.global internal constant @msg_pass_p(
+      "[mlir] rank 0 (producer): kernel returned\0A\00") {addr_space = 0 : i32}
+  llvm.mlir.global internal constant @msg_pass_c(
+      "[mlir] rank 1 (consumer): cross-rank kernel write PASS (verify[0]=%.1f)\0A\00") {addr_space = 0 : i32}
+  llvm.mlir.global internal constant @msg_fail(
+      "[mlir] rank 1 (consumer): MISMATCH at idx=%ld got=%.1f expected=42.0\0A\00") {addr_space = 0 : i32}
+  llvm.mlir.global internal constant @msg_done(
+      "[mlir] rank %d: ALL PASSED\0A\00") {addr_space = 0 : i32}
+
+  // ---- GPU kernels ------------------------------------------------------
+  gpu.module @sym_kernels {
+
+    // Returns a raw !llvm.ptr to element %wid of a memref<4xi32> flag array.
+    // The drop to llvm.ptr is needed because memref dialect atomics
+    // (memref.atomic_rmw, memref.generic_atomic_rmw) lack ordering and
+    // syncscope today, and there is no memref.atomic_load/store at all.
+    // TODO: when upstream memref grows ordering+syncscope (track in
+    // mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td), inline this and
+    // use the memref-level ops directly.
+    func.func private @flag_slot_ptr(%flags : memref<4xi32>, %wid : index) -> !llvm.ptr {
+      %base_idx = memref.extract_aligned_pointer_as_index %flags : memref<4xi32> -> index
+      %base_i64 = arith.index_cast %base_idx : index to i64
+      %warp_i64 = arith.index_cast %wid : index to i64
+      %base_ptr = llvm.inttoptr %base_i64 : i64 to !llvm.ptr
+      %elem_ptr = llvm.getelementptr %base_ptr[%warp_i64] : (!llvm.ptr, i64) -> !llvm.ptr, i32
+      return %elem_ptr : !llvm.ptr
+    }
+
+    // Producer: every thread writes 42.0 into the peer's data buffer; lane 0
+    // of each warp then release-atomicrmws the peer's flag for that warp.
+    gpu.func @producer(%data : memref<256xf32>,
+                       %flags : memref<4xi32>,
+                       %bases : memref<?xindex>) kernel
+                       attributes {gpu.known_block_size = array<i32: 256, 1, 1>,
+                                   gpu.known_grid_size  = array<i32: 1, 1, 1>} {
+      %zero = arith.constant 0 : index
+      %warp_size = arith.constant 64 : index
+      %flag_set = arith.constant 1 : i32
+      %payload = arith.constant 42.0 : f32
+      %src_rank = arith.constant 0 : index   // rank 0 (producer)
+      %dst_rank = arith.constant 1 : index   // rank 1 (consumer)
+
+      %thread = gpu.thread_id x
+      %warp = arith.divui %thread, %warp_size : index
+      %lane_id = arith.remui %thread, %warp_size : index
+
+      %remote_data = air.translate %data, %src_rank, %dst_rank, %bases : memref<256xf32>, memref<?xindex>
+      %remote_flags = air.translate %flags, %src_rank, %dst_rank, %bases : memref<4xi32>, memref<?xindex>
+      memref.store %payload, %remote_data[%thread] : memref<256xf32>
+
+      %leader = arith.cmpi eq, %lane_id, %zero : index
+      scf.if %leader {
+        // An empty syncscope is LLVM's System scope (cross-device on AMDGPU);
+        // sym_atomic_syncscope.mlir holds the contract test for it.
+        %flag_ptr = func.call @flag_slot_ptr(%remote_flags, %warp)
+            : (memref<4xi32>, index) -> !llvm.ptr
+        %prev = llvm.atomicrmw xchg %flag_ptr, %flag_set syncscope("") release
+            : !llvm.ptr, i32
+      }
+      gpu.return
+    }
+
+    // Consumer: lane 0 of each warp spins (acquire) on that warp's flag; then
+    // every thread copies its own data element into verify_buf for host check.
+    gpu.func @consumer(%data       : memref<256xf32>,
+                       %verify_buf : memref<256xf32>,
+                       %flags      : memref<4xi32>) kernel
+                       attributes {gpu.known_block_size = array<i32: 256, 1, 1>,
+                                   gpu.known_grid_size  = array<i32: 1, 1, 1>} {
+      %c0 = arith.constant 0 : index
+      %c64 = arith.constant 64 : index
+      %c0_i32 = arith.constant 0 : i32
+
+      %tid = gpu.thread_id x
+      %wid = arith.divui %tid, %c64 : index
+      %lane = arith.remui %tid, %c64 : index
+
+      %is_lane0 = arith.cmpi eq, %lane, %c0 : index
+      scf.if %is_lane0 {
+        %slot_ptr = func.call @flag_slot_ptr(%flags, %wid)
+            : (memref<4xi32>, index) -> !llvm.ptr
+        // Spin for as long as the flag still reads 0.
+        scf.while : () -> () {
+          %v = llvm.load %slot_ptr atomic syncscope("") acquire
+              {alignment = 4 : i64} : !llvm.ptr -> i32
+          %not_ready = arith.cmpi eq, %v, %c0_i32 : i32
+          scf.condition(%not_ready)
+        } do {
+          scf.yield
+        }
+      }
+      // No gpu.barrier: on AMDGPU lanes within a wave execute in SIMT
+      // lockstep, so lanes 1..63 cannot leave the scf.if before lane 0
+      // does, and the wave-shared L1 means lane 0's syncscope("") acquire
+      // makes the producer's writes visible to the whole wave.
+      %v = memref.load %data[%tid] : memref<256xf32>
+      memref.store %v, %verify_buf[%tid] : memref<256xf32>
+      gpu.return
+    }
+  }
+
+  // ---- Helpers ----------------------------------------------------------
+  // The one helper that exposes the memref ABI: it wraps a raw runtime
+  // !llvm.ptr as a 1-D byte memref. Typed views are then derived from it with
+  // memref.view, which keeps the hand-built descriptor struct in one place.
+  // Phase 4's AIRSymmetricAllocToMgpuPass will replace this entirely.
+  func.func private @wrap_bytes(%ptr : !llvm.ptr, %size : i64) -> memref<?xi8> {
+    %offset = arith.constant 0 : i64
+    %stride = arith.constant 1 : i64
+    %desc0 = llvm.mlir.poison : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
+    %desc1 = llvm.insertvalue %ptr, %desc0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
+    %desc2 = llvm.insertvalue %ptr, %desc1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
+    %desc3 = llvm.insertvalue %offset, %desc2[2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
+    %desc4 = llvm.insertvalue %size, %desc3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
+    %desc5 = llvm.insertvalue %stride, %desc4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
+    %bytes = builtin.unrealized_conversion_cast %desc5
+        : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> to memref<?xi8>
+    return %bytes : memref<?xi8>
+  }
+
+  // ---- main ------------------------------------------------------------
+  func.func @main() {
+    %c0_i32 = arith.constant 0 : i32
+    %c1_i32 = arith.constant 1 : i32
+    %c0_i64 = arith.constant 0 : i64
+    %c1024_bytes = arith.constant 1024 : i64   // 256 f32 = 1024 bytes
+    %c16_bytes   = arith.constant 16   : i64   // 4 i32  = 16 bytes
+    %heap_size   = arith.constant 268435456 : i64  // 256 MB
+    %nullptr = llvm.mlir.zero : !llvm.ptr
+    %false = arith.constant false
+
+    %c1 = arith.constant 1 : index
+    %c256 = arith.constant 256 : index
+
+    // Heap init (collective).
+    func.call @mgpuSymmetricHeapInit(%heap_size) : (i64) -> ()
+    %rank = func.call @mgpuGetRank() : () -> i32
+    %world = func.call @mgpuGetWorldSize() : () -> i32
+    %fmt_init = llvm.mlir.addressof @msg_init : !llvm.ptr
+    llvm.call @printf(%fmt_init, %rank, %world)
+        vararg(!llvm.func<i32 (ptr, ...)>) : (!llvm.ptr, i32, i32) -> i32
+
+    // Symmetric allocations: data (256 f32) + flags (4 i32).
+    %data_ptr  = func.call @mgpuSymmetricAlloc(%c1024_bytes, %nullptr) : (i64, !llvm.ptr) -> !llvm.ptr
+    %flags_ptr = func.call @mgpuSymmetricAlloc(%c16_bytes,   %nullptr) : (i64, !llvm.ptr) -> !llvm.ptr
+
+    // Zero-init flags from host so the consumer's spin starts at 0.
+    %flags_host = memref.alloc() : memref<4xi32>
+    %fc0 = arith.constant 0 : index
+    %fc1 = arith.constant 1 : index
+    %fc4 = arith.constant 4 : index
+    scf.for %i = %fc0 to %fc4 step %fc1 {
+      memref.store %c0_i32, %flags_host[%i] : memref<4xi32>
+    }
+    %flags_host_intptr = memref.extract_aligned_pointer_as_index %flags_host
+        : memref<4xi32> -> index
+    %flags_host_int = arith.index_cast %flags_host_intptr : index to i64
+    %flags_host_ptr = llvm.inttoptr %flags_host_int : i64 to !llvm.ptr
+    func.call @mgpuMemcpy(%flags_ptr, %flags_host_ptr, %c16_bytes, %nullptr)
+        : (!llvm.ptr, !llvm.ptr, i64, !llvm.ptr) -> ()
+    memref.dealloc %flags_host : memref<4xi32>
+
+    func.call @mgpuBarrier() : () -> ()  // flags init visible to all ranks
+
+    %c0_view = arith.constant 0 : index
+    %data_bytes  = func.call @wrap_bytes(%data_ptr,  %c1024_bytes) : (!llvm.ptr, i64) -> memref<?xi8>
+    %flags_bytes = func.call @wrap_bytes(%flags_ptr, %c16_bytes)   : (!llvm.ptr, i64) -> memref<?xi8>
+    %data_m  = memref.view %data_bytes[%c0_view][]  : memref<?xi8> to memref<256xf32>
+    %flags_m = memref.view %flags_bytes[%c0_view][] : memref<?xi8> to memref<4xi32>
+
+    // mgpuGetHeapBases() returns a HOST pointer; GPU can't deref it, so
+    // copy to device. TODO(airgpu): make heap_bases device-accessible
+    // (hipMallocManaged / hipHostMalloc-Mapped) and drop this copy.
+    %world_i64 = arith.extui %world : i32 to i64
+    %c8_i64 = arith.constant 8 : i64
+    %bases_size = arith.muli %world_i64, %c8_i64 : i64
+    %bases_host = func.call @mgpuGetHeapBases() : () -> !llvm.ptr
+    %bases_devptr = func.call @mgpuMemAlloc(%bases_size, %nullptr, %false)
+        : (i64, !llvm.ptr, i1) -> !llvm.ptr
+    func.call @mgpuMemcpy(%bases_devptr, %bases_host, %bases_size, %nullptr)
+        : (!llvm.ptr, !llvm.ptr, i64, !llvm.ptr) -> ()
+    %bases_bytes = func.call @wrap_bytes(%bases_devptr, %bases_size) : (!llvm.ptr, i64) -> memref<?xi8>
+    %world_idx = arith.index_cast %world_i64 : i64 to index
+    %bases = memref.view %bases_bytes[%c0_view][%world_idx] : memref<?xi8> to memref<?xindex>
+
+    // Rank 0 = producer, rank 1 = consumer. Ranks > 1 idle.
+    // (Future: extend to all-pairs producer/consumer mesh.)
+    // Precondition: world >= 2 — enforced by run.sh, not re-checked here.
+    %is_producer = arith.cmpi eq, %rank, %c0_i32 : i32
+    scf.if %is_producer {
+      gpu.launch_func @sym_kernels::@producer
+          blocks  in (%c1, %c1, %c1)
+          threads in (%c256, %c1, %c1)
+          args(%data_m  : memref<256xf32>,
+               %flags_m : memref<4xi32>,
+               %bases   : memref<?xindex>)
+      %fmt_p = llvm.mlir.addressof @msg_pass_p : !llvm.ptr
+      llvm.call @printf(%fmt_p)
+          vararg(!llvm.func<i32 (ptr, ...)>) : (!llvm.ptr) -> i32
+    } else {
+      %is_consumer = arith.cmpi eq, %rank, %c1_i32 : i32
+      scf.if %is_consumer {
+        %verify_ptr = func.call @mgpuMemAlloc(%c1024_bytes, %nullptr, %false)
+            : (i64, !llvm.ptr, i1) -> !llvm.ptr
+        %verify_bytes = func.call @wrap_bytes(%verify_ptr, %c1024_bytes) : (!llvm.ptr, i64) -> memref<?xi8>
+        %verify_m = memref.view %verify_bytes[%c0_view][] : memref<?xi8> to memref<256xf32>
+        gpu.launch_func @sym_kernels::@consumer
+            blocks  in (%c1, %c1, %c1)
+            threads in (%c256, %c1, %c1)
+            args(%data_m  : memref<256xf32>,
+                 %verify_m: memref<256xf32>,
+                 %flags_m : memref<4xi32>)
+
+        // D2H readback verify_buf and check ALL 256 elements == 42.0.
+        // (Checking only element 0 would mask a bug where warps 1..3
+        // didn't write their slice. exit(1) on mismatch makes the
+        // multi-process driver see a non-zero exit code.)
+        %hb = memref.alloc() : memref<256xf32>
+        %hb_intptr = memref.extract_aligned_pointer_as_index %hb : memref<256xf32> -> index
+        %hb_int = arith.index_cast %hb_intptr : index to i64
+        %hb_ptr = llvm.inttoptr %hb_int : i64 to !llvm.ptr
+        func.call @mgpuMemcpy(%hb_ptr, %verify_ptr, %c1024_bytes, %nullptr)
+            : (!llvm.ptr, !llvm.ptr, i64, !llvm.ptr) -> ()
+
+        %c0_idx = arith.constant 0 : index
+        %c1_idx = arith.constant 1 : index
+        %c256_idx = arith.constant 256 : index
+        %expected = arith.constant 42.0 : f32
+
+        // Count mismatches; print msg_fail on the first.
+        %nfail = scf.for %i = %c0_idx to %c256_idx step %c1_idx
+                        iter_args(%nfail_acc = %c0_i32) -> (i32) {
+          %v = memref.load %hb[%i] : memref<256xf32>
+          %ne = arith.cmpf une, %v, %expected : f32
+          %new_nfail = scf.if %ne -> i32 {
+            %is_first = arith.cmpi eq, %nfail_acc, %c0_i32 : i32
+            scf.if %is_first {
+              %fmt_fail = llvm.mlir.addressof @msg_fail : !llvm.ptr
+              %i_i64 = arith.index_cast %i : index to i64
+              %v_64 = arith.extf %v : f32 to f64
+              %e_64 = arith.extf %expected : f32 to f64
+              llvm.call @printf(%fmt_fail, %rank, %i_i64, %v_64, %e_64)
+                  vararg(!llvm.func<i32 (ptr, ...)>)
+                  : (!llvm.ptr, i32, i64, f64, f64) -> i32
+            }
+            %inc = arith.addi %nfail_acc, %c1_i32 : i32
+            scf.yield %inc : i32
+          } else {
+            scf.yield %nfail_acc : i32
+          }
+          scf.yield %new_nfail : i32
+        }
+
+        %ok_all = arith.cmpi eq, %nfail, %c0_i32 : i32
+        scf.if %ok_all {
+          %fmt_c = llvm.mlir.addressof @msg_pass_c : !llvm.ptr
+          %v0 = memref.load %hb[%c0_idx] : memref<256xf32>
+          %v0_64 = arith.extf %v0 : f32 to f64
+          llvm.call @printf(%fmt_c, %v0_64)
+              vararg(!llvm.func<i32 (ptr, ...)>) : (!llvm.ptr, f64) -> i32
+        } else {
+          func.call @exit(%c1_i32) : (i32) -> ()
+        }
+
+        memref.dealloc %hb : memref<256xf32>
+        func.call @mgpuMemFree(%verify_ptr, %nullptr) : (!llvm.ptr, !llvm.ptr) -> ()
+      }
+    }
+
+    func.call @mgpuBarrier() : () -> ()
+    func.call @mgpuMemFree(%bases_devptr, %nullptr) : (!llvm.ptr, !llvm.ptr) -> ()
+    func.call @mgpuSymmetricFree(%data_ptr,  %nullptr) : (!llvm.ptr, !llvm.ptr) -> ()
+    func.call @mgpuSymmetricFree(%flags_ptr, %nullptr) : (!llvm.ptr, !llvm.ptr) -> ()
+    func.call @mgpuSymmetricHeapDestroy() : () -> ()
+
+    %fmt_done = llvm.mlir.addressof @msg_done : !llvm.ptr
+    llvm.call @printf(%fmt_done, %rank)
+        vararg(!llvm.func<i32 (ptr, ...)>) : (!llvm.ptr, i32) -> i32
+
+    return
+  }
+}
diff --git a/test/gpu/symmetric_heap_dma/air_sym_handwritten_cacheline.mlir b/test/gpu/symmetric_heap_dma/air_sym_handwritten_cacheline.mlir
new file mode 100644
index 000000000..5c65a6bd0
--- /dev/null
+++ b/test/gpu/symmetric_heap_dma/air_sym_handwritten_cacheline.mlir
@@ -0,0 +1,358 @@
+//===- air_sym_handwritten_cacheline.mlir - multi-GPU e2e (cache line) ----===//
+//
+// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+//===------------------------------------------------------------------===//
+//
+// Symmetric-heap producer/consumer e2e (WORLD_SIZE=2), cache-line variant.
+// Sister file: air_sym_handwritten_atomic.mlir uses LLVM atomicrmw / atomic
+// load with syncscope("") for the cross-rank handoff.
+//
+//   rank 0 launches @producer; rank 1 launches @consumer.
+//
+// Message-passing via cache-line atomicity (no atomics, no fences)
+// ================================================================
+//
+// Assuming one cache line = 128 bytes = 32 i32:
+//
+//        ┌─────────────────────────────────────────────────────┐
+//        │                  128-byte cache line                │
+//        ├────┬────┬────┬────┬─── ··· ───┬────┬───────────────┤
+//  lane: │  0 │  1 │  2 │  3 │           │ 30 │  31 ◄── flag  │
+//        ├────┼────┼────┼────┤           ├────┼───────────────┤
+//  init: │  0 │  0 │  0 │  0 │    0 ···  │  0 │   0           │
+//        ├────┼────┼────┼────┤           ├────┼───────────────┤
+//  prod: │100 │101 │102 │103 │ lane+100  │130 │   1           │
+//        └────┴────┴────┴────┴─── ··· ───┴────┴───────────────┘
+//
+// Producer (rank 0, 1 wave × 64 lanes):
+//   data[lane] = (lane == 31) ? 1 : (lane + 100)   // single vec store
+//
+// Consumer (rank 1, 1 wave × 64 lanes), spin loop:
+//   v    = data[lane]                              // single vec load
+//   flag = gpu.shuffle idx v, lane=31, width=64   // broadcast lane 31's val
+//   if flag == 1: break, else retry
+//
+// Why this works on gfx940 / MI300:
+//   - Producer's vec-store commits the whole 128-byte cache line as one HW
+//     transaction; lane 31's "1" is published with the same coherence event
+//     as lanes 0..30's payload (the compiler cannot split a uniform vector
+//     store of 32 i32 into per-lane sub-stores).
+//   - The XGMI coherence fabric on MI300 publishes peer cache lines whole
+//     (not per-lane), so when consumer's lane 31 observes flag==1, lanes
+//     0..30 of the same line are guaranteed visible from this load.
+//   - shuffle-broadcast of the flag is wave-uniform, so all 64 lanes break
+//     in lockstep; no need for control-flow synchronization.
+//
+// Trade-off vs the previous LLVM-atomic design: this trades a spec-defined
+// ordering contract (atomicrmw release / atomic load acquire with
+// syncscope("") = AMDGPUUsage System) for a microarchitectural one. It is
+// simpler and matches how real GPU code does fast intra-rank handoff, but
+// the atomicity guarantee is not in the AMDGPU LangRef the way LLVM atomic
+// scopes are.
+//
+// Note on lanes 32..63: data is sized to one cache line (32 i32), so only
+// lanes 0..31 access it. Lanes 32..63 still participate in gpu.shuffle so
+// the shuffle stays wave-uniform; their loads are guarded by `lane < 32`.
+//
+// Launcher: run.sh forks N processes with RANK / WORLD_SIZE / LOCAL_RANK.
+//
+//===------------------------------------------------------------------===//
+
+module attributes {gpu.container_module} {
+  // ---- mgpu* C ABI declarations -----------------------------------------
+  func.func private @mgpuSymmetricHeapInit(i64)
+  func.func private @mgpuSymmetricHeapDestroy()
+  func.func private @mgpuGetRank() -> i32
+  func.func private @mgpuGetWorldSize() -> i32
+  func.func private @mgpuSymmetricAlloc(i64, !llvm.ptr) -> !llvm.ptr
+  func.func private @mgpuSymmetricFree(!llvm.ptr, !llvm.ptr)
+  func.func private @mgpuGetHeapBases() -> !llvm.ptr
+  func.func private @mgpuBarrier()
+  func.func private @mgpuMemAlloc(i64, !llvm.ptr, i1) -> !llvm.ptr
+  func.func private @mgpuMemFree(!llvm.ptr, !llvm.ptr)
+  func.func private @mgpuMemcpy(!llvm.ptr, !llvm.ptr, i64, !llvm.ptr)
+
+  // libc exit — verify branch calls this on any mismatch so run.sh
+  // sees a non-zero process exit (no green-without-validation).
+  func.func private @exit(i32)
+
+  llvm.func @printf(!llvm.ptr, ...) -> i32
+
+  llvm.mlir.global internal constant @msg_init(
+      "[mlir] rank %d / world %d, init OK\0A\00") {addr_space = 0 : i32}
+  llvm.mlir.global internal constant @msg_pass_p(
+      "[mlir] rank 0 (producer): kernel returned\0A\00") {addr_space = 0 : i32}
+  llvm.mlir.global internal constant @msg_pass_c(
+      "[mlir] rank 1 (consumer): cache-line message PASS (data[0]=%d, flag=%d)\0A\00") {addr_space = 0 : i32}
+  llvm.mlir.global internal constant @msg_fail(  // varargs: rank (i32), idx (i64), got (i32), expected (i32)
+      "[mlir] rank %d (consumer): MISMATCH at idx=%ld got=%d expected=%d\0A\00") {addr_space = 0 : i32}
+  llvm.mlir.global internal constant @msg_done(
+      "[mlir] rank %d: ALL PASSED\0A\00") {addr_space = 0 : i32}
+
+  // ---- GPU kernels ------------------------------------------------------
+  gpu.module @sym_kernels {
+
+    // Producer kernel (rank 0 launches it from @main with 64 threads): lanes
+    // 0..31 each store one i32 through the air.translate'd view of %data;
+    // lane 31 stores the flag (1), lanes 0..30 store lane+100. Lanes 32..63 idle.
+    gpu.func @producer(%data : memref<32xi32>,
+                       %bases : memref<?xindex>) kernel
+                       attributes {gpu.known_block_size = array<i32: 64, 1, 1>,
+                                   gpu.known_grid_size  = array<i32: 1, 1, 1>} {
+      %c1_i32   = arith.constant 1   : i32     // flag value stored by lane 31
+      %c100_i32 = arith.constant 100 : i32     // payload bias: lane + 100
+      %c31      = arith.constant 31  : index   // lane that carries the flag
+      %c32      = arith.constant 32  : index   // number of lanes that store
+      %from = arith.constant 0 : index   // rank 0 (producer)
+      %to   = arith.constant 1 : index   // rank 1 (consumer)
+
+      %tid = gpu.thread_id x
+      %active = arith.cmpi ult, %tid, %c32 : index   // true for lanes 0..31 only
+      %peer_data = air.translate %data, %from, %to, %bases
+          : memref<32xi32>, memref<?xindex>   // presumably %data rebased from rank %from's heap to rank %to's — confirm
+
+      scf.if %active {
+        %is_flag  = arith.cmpi eq, %tid, %c31 : index
+        %tid_i32  = arith.index_cast %tid : index to i32
+        %payload  = arith.addi %tid_i32, %c100_i32 : i32
+        %val      = arith.select %is_flag, %c1_i32, %payload : i32   // 1 for lane 31, lane+100 otherwise
+        memref.store %val, %peer_data[%tid] : memref<32xi32>   // one scalar store per active lane
+      }
+      gpu.return
+    }
+
+    // Consumer kernel (rank 1 launches it from @main with 64 threads): spins
+    // on %data, broadcasting lane 31's value via gpu.shuffle until that value
+    // is 1. Lanes 0..31 then store the value they last loaded into
+    // %verify_buf, which the host reads back and checks.
+    gpu.func @consumer(%data       : memref<32xi32>,
+                       %verify_buf : memref<32xi32>) kernel
+                       attributes {gpu.known_block_size = array<i32: 64, 1, 1>,
+                                   gpu.known_grid_size  = array<i32: 1, 1, 1>} {
+      %c0_i32  = arith.constant 0  : i32   // loop seed and the value inactive lanes feed the shuffle
+      %c1_i32  = arith.constant 1  : i32   // flag value that ends the spin
+      %c31_i32 = arith.constant 31 : i32   // shuffle source lane (the flag lane)
+      %c64_i32 = arith.constant 64 : i32   // shuffle width
+      %c32     = arith.constant 32 : index // number of lanes that touch memory
+
+      %tid = gpu.thread_id x
+      %active = arith.cmpi ult, %tid, %c32 : index
+
+      // Spin loop: all 64 lanes execute the shuffle. Lanes 32..63 do no
+      // memory access and feed the constant 0 into the shuffle; the shuffle
+      // reads lane 31, so their input does not affect %flag. Every lane
+      // compares the same broadcast %flag, so all lanes leave the loop on
+      // the same iteration; the last %v of each lane becomes %final_v.
+      %final_v = scf.while (%dummy = %c0_i32) : (i32) -> i32 {
+        %v = scf.if %active -> i32 {
+          %loaded = memref.load %data[%tid] : memref<32xi32>   // NOTE(review): plain load; confirm lowering re-reads it every iteration
+          scf.yield %loaded : i32
+        } else {
+          scf.yield %c0_i32 : i32
+        }
+        %flag, %valid = gpu.shuffle idx %v, %c31_i32, %c64_i32 : i32   // %valid is not used
+        %not_ready = arith.cmpi ne, %flag, %c1_i32 : i32
+        scf.condition(%not_ready) %v : i32   // continue while flag != 1; %v is forwarded either way
+      } do {
+      ^bb0(%v_iter : i32):
+        scf.yield %v_iter : i32   // value is carried but never read (%dummy is unused)
+      }
+
+      scf.if %active {
+        memref.store %final_v, %verify_buf[%tid] : memref<32xi32>
+      }
+      gpu.return
+    }
+  }
+
+  // ---- Helpers ----------------------------------------------------------
+  // Single ABI-leaking helper: wrap a raw runtime !llvm.ptr as a 1-D byte
+  // memref of %size elements. All typed views in @main derive from this via
+  // memref.view, so the hand-built LLVM-struct descriptor literal lives in
+  // exactly one place. Phase 4's AIRSymmetricAllocToMgpuPass will replace it.
+  func.func private @wrap_bytes(%ptr : !llvm.ptr, %size : i64) -> memref<?xi8> {
+    %c0_i64 = arith.constant 0 : i64
+    %c1_i64 = arith.constant 1 : i64
+    %d0 = llvm.mlir.poison : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
+    %d1 = llvm.insertvalue %ptr,    %d0[0]    : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>  // field 0: allocated pointer (per MLIR memref descriptor convention)
+    %d2 = llvm.insertvalue %ptr,    %d1[1]    : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>  // field 1: aligned pointer (same address)
+    %d3 = llvm.insertvalue %c0_i64, %d2[2]    : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>  // field 2: offset = 0
+    %d4 = llvm.insertvalue %size,   %d3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>  // field 3: sizes[0] = %size
+    %d5 = llvm.insertvalue %c1_i64, %d4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>  // field 4: strides[0] = 1
+    %m  = builtin.unrealized_conversion_cast %d5
+        : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> to memref<?xi8>
+    return %m : memref<?xi8>
+  }
+
+  // ---- main ------------------------------------------------------------
+  func.func @main() {  // Host driver: heap init, zero the line, launch producer/consumer by rank, verify, tear down.
+    %c0_i32 = arith.constant 0 : i32
+    %c1_i32 = arith.constant 1 : i32
+    %c0_i64 = arith.constant 0 : i64              // NOTE(review): appears unused in @main
+    %c128_bytes  = arith.constant 128 : i64       // 32 i32 = one cache line
+    %heap_size   = arith.constant 268435456 : i64 // 256 MB
+    %nullptr = llvm.mlir.zero : !llvm.ptr         // passed as the trailing !llvm.ptr argument of the mgpu* calls
+    %false = arith.constant false                 // third argument of mgpuMemAlloc
+
+    %c1 = arith.constant 1 : index                // grid dims and unit block dims
+    %c64 = arith.constant 64 : index              // threads per block for both kernels
+
+    // Heap init (collective).
+    func.call @mgpuSymmetricHeapInit(%heap_size) : (i64) -> ()
+    %rank = func.call @mgpuGetRank() : () -> i32
+    %world = func.call @mgpuGetWorldSize() : () -> i32
+    %fmt_init = llvm.mlir.addressof @msg_init : !llvm.ptr
+    llvm.call @printf(%fmt_init, %rank, %world)
+        vararg(!llvm.func<i32 (ptr, ...)>) : (!llvm.ptr, i32, i32) -> i32
+
+    // Single 128-byte symmetric allocation (32 i32 = one cache line).
+    %data_ptr  = func.call @mgpuSymmetricAlloc(%c128_bytes, %nullptr)
+        : (i64, !llvm.ptr) -> !llvm.ptr
+
+    // Zero-init data from host so the consumer's spin starts seeing flag=0
+    // (and so the validation can distinguish "never written" from "wrote 0").
+    %data_host = memref.alloc() : memref<32xi32>
+    %dc0 = arith.constant 0 : index
+    %dc1 = arith.constant 1 : index
+    %dc32 = arith.constant 32 : index
+    scf.for %i = %dc0 to %dc32 step %dc1 {
+      memref.store %c0_i32, %data_host[%i] : memref<32xi32>
+    }
+    %data_host_intptr = memref.extract_aligned_pointer_as_index %data_host
+        : memref<32xi32> -> index
+    %data_host_int = arith.index_cast %data_host_intptr : index to i64
+    %data_host_ptr = llvm.inttoptr %data_host_int : i64 to !llvm.ptr
+    func.call @mgpuMemcpy(%data_ptr, %data_host_ptr, %c128_bytes, %nullptr)
+        : (!llvm.ptr, !llvm.ptr, i64, !llvm.ptr) -> ()
+    memref.dealloc %data_host : memref<32xi32>
+
+    func.call @mgpuBarrier() : () -> ()  // zero-init visible to all ranks
+
+    %c0_view = arith.constant 0 : index  // byte shift used by every memref.view below
+    %data_bytes = func.call @wrap_bytes(%data_ptr, %c128_bytes)
+        : (!llvm.ptr, i64) -> memref<?xi8>
+    %data_m = memref.view %data_bytes[%c0_view][]
+        : memref<?xi8> to memref<32xi32>
+
+    // mgpuGetHeapBases() returns a HOST pointer; GPU can't deref it, so
+    // copy to device. TODO(airgpu): make heap_bases device-accessible
+    // (hipMallocManaged / hipHostMalloc-Mapped) and drop this copy.
+    %world_i64 = arith.extui %world : i32 to i64
+    %c8_i64 = arith.constant 8 : i64     // bytes per heap-base entry
+    %bases_size = arith.muli %world_i64, %c8_i64 : i64
+    %bases_host = func.call @mgpuGetHeapBases() : () -> !llvm.ptr
+    %bases_devptr = func.call @mgpuMemAlloc(%bases_size, %nullptr, %false)
+        : (i64, !llvm.ptr, i1) -> !llvm.ptr
+    func.call @mgpuMemcpy(%bases_devptr, %bases_host, %bases_size, %nullptr)
+        : (!llvm.ptr, !llvm.ptr, i64, !llvm.ptr) -> ()
+    %bases_bytes = func.call @wrap_bytes(%bases_devptr, %bases_size)
+        : (!llvm.ptr, i64) -> memref<?xi8>
+    %world_idx = arith.index_cast %world_i64 : i64 to index
+    %bases = memref.view %bases_bytes[%c0_view][%world_idx]
+        : memref<?xi8> to memref<?xindex>
+
+    // Rank 0 = producer, rank 1 = consumer. Ranks > 1 idle.
+    // (Future: extend to all-pairs producer/consumer mesh.)
+    // Precondition: world >= 2 — enforced by run.sh, not re-checked here.
+    %is_producer = arith.cmpi eq, %rank, %c0_i32 : i32
+    scf.if %is_producer {
+      gpu.launch_func @sym_kernels::@producer
+          blocks  in (%c1, %c1, %c1)
+          threads in (%c64, %c1, %c1)
+          args(%data_m : memref<32xi32>,
+               %bases  : memref<?xindex>)
+      %fmt_p = llvm.mlir.addressof @msg_pass_p : !llvm.ptr
+      llvm.call @printf(%fmt_p)
+          vararg(!llvm.func<i32 (ptr, ...)>) : (!llvm.ptr) -> i32
+    } else {
+      %is_consumer = arith.cmpi eq, %rank, %c1_i32 : i32
+      scf.if %is_consumer {
+        %verify_ptr = func.call @mgpuMemAlloc(%c128_bytes, %nullptr, %false)
+            : (i64, !llvm.ptr, i1) -> !llvm.ptr
+        %verify_bytes = func.call @wrap_bytes(%verify_ptr, %c128_bytes)
+            : (!llvm.ptr, i64) -> memref<?xi8>
+        %verify_m = memref.view %verify_bytes[%c0_view][]
+            : memref<?xi8> to memref<32xi32>
+        gpu.launch_func @sym_kernels::@consumer
+            blocks  in (%c1, %c1, %c1)
+            threads in (%c64, %c1, %c1)
+            args(%data_m  : memref<32xi32>,
+                 %verify_m: memref<32xi32>)
+
+        // D2H readback verify_buf and check all 32 ints:
+        //   verify[i] == i + 100 for i in 0..30,
+        //   verify[31] == 1 (flag).
+        %hb = memref.alloc() : memref<32xi32>
+        %hb_intptr = memref.extract_aligned_pointer_as_index %hb
+            : memref<32xi32> -> index
+        %hb_int = arith.index_cast %hb_intptr : index to i64
+        %hb_ptr = llvm.inttoptr %hb_int : i64 to !llvm.ptr
+        func.call @mgpuMemcpy(%hb_ptr, %verify_ptr, %c128_bytes, %nullptr)
+            : (!llvm.ptr, !llvm.ptr, i64, !llvm.ptr) -> ()
+
+        %c0_idx   = arith.constant 0   : index
+        %c1_idx   = arith.constant 1   : index
+        %c31_idx  = arith.constant 31  : index
+        %c32_idx  = arith.constant 32  : index
+        %c100_i32 = arith.constant 100 : i32
+
+        // Count mismatches; print msg_fail on the first.
+        %nfail = scf.for %i = %c0_idx to %c32_idx step %c1_idx
+                        iter_args(%nfail_acc = %c0_i32) -> (i32) {
+          %v = memref.load %hb[%i] : memref<32xi32>
+          %is_flag_idx = arith.cmpi eq, %i, %c31_idx : index
+          %expected = scf.if %is_flag_idx -> i32 {
+            scf.yield %c1_i32 : i32
+          } else {
+            %i_i32 = arith.index_cast %i : index to i32
+            %e = arith.addi %i_i32, %c100_i32 : i32
+            scf.yield %e : i32
+          }
+          %ne = arith.cmpi ne, %v, %expected : i32
+          %new_nfail = scf.if %ne -> i32 {
+            %is_first = arith.cmpi eq, %nfail_acc, %c0_i32 : i32
+            scf.if %is_first {
+              %fmt_fail = llvm.mlir.addressof @msg_fail : !llvm.ptr
+              %i_i64 = arith.index_cast %i : index to i64
+              llvm.call @printf(%fmt_fail, %rank, %i_i64, %v, %expected)  // NOTE(review): four varargs (rank, idx, got, expected); confirm @msg_fail consumes all four
+                  vararg(!llvm.func<i32 (ptr, ...)>)
+                  : (!llvm.ptr, i32, i64, i32, i32) -> i32
+            }
+            %inc = arith.addi %nfail_acc, %c1_i32 : i32
+            scf.yield %inc : i32
+          } else {
+            scf.yield %nfail_acc : i32
+          }
+          scf.yield %new_nfail : i32
+        }
+
+        %ok_all = arith.cmpi eq, %nfail, %c0_i32 : i32
+        scf.if %ok_all {
+          %fmt_c = llvm.mlir.addressof @msg_pass_c : !llvm.ptr
+          %v0 = memref.load %hb[%c0_idx] : memref<32xi32>
+          %vf = memref.load %hb[%c31_idx] : memref<32xi32>
+          llvm.call @printf(%fmt_c, %v0, %vf)
+              vararg(!llvm.func<i32 (ptr, ...)>) : (!llvm.ptr, i32, i32) -> i32
+        } else {
+          func.call @exit(%c1_i32) : (i32) -> ()  // any mismatch: exit code 1, before the barrier and frees below
+        }
+
+        memref.dealloc %hb : memref<32xi32>
+        func.call @mgpuMemFree(%verify_ptr, %nullptr) : (!llvm.ptr, !llvm.ptr) -> ()
+      }
+    }
+
+    func.call @mgpuBarrier() : () -> ()  // rendezvous before teardown
+    func.call @mgpuMemFree(%bases_devptr, %nullptr) : (!llvm.ptr, !llvm.ptr) -> ()
+    func.call @mgpuSymmetricFree(%data_ptr, %nullptr) : (!llvm.ptr, !llvm.ptr) -> ()
+    func.call @mgpuSymmetricHeapDestroy() : () -> ()
+
+    %fmt_done = llvm.mlir.addressof @msg_done : !llvm.ptr
+    llvm.call @printf(%fmt_done, %rank)
+        vararg(!llvm.func<i32 (ptr, ...)>) : (!llvm.ptr, i32) -> i32
+
+    return
+  }
+}
diff --git a/test/gpu/symmetric_heap_dma/air_sym_with_alloc.mlir b/test/gpu/symmetric_heap_dma/air_sym_with_alloc.mlir
new file mode 100644
index 000000000..5b0e892e3
--- /dev/null
+++ b/test/gpu/symmetric_heap_dma/air_sym_with_alloc.mlir
@@ -0,0 +1,132 @@
+//===- air_sym_with_alloc.mlir - air.rank + memref.alloc air.symmetric e2e ===//
+//
+// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+//===-----------------------------------------------------------------------===//
+//
+// Variant of air_sym_with_rank.mlir that uses `memref.alloc {air.symmetric}`
+// instead of a direct call to `mgpuSymmetricAlloc`. Exercises Phase 3
+// (`air-rank-to-mgpu`) AND Phase 4 (`air-symmetric-alloc-to-mgpu`).
+//
+// The symmetric memref is wrapped/unwrapped via the standard
+// `memref.extract_aligned_pointer_as_index` -> `llvm.inttoptr` idiom to
+// recover the !llvm.ptr that the runtime ABI expects.
+//
+//===-----------------------------------------------------------------------===//
+
+module {
+  func.func private @mgpuGetWorldSize() -> i32
+  func.func private @mgpuGetHeapBases() -> !llvm.ptr
+  func.func private @mgpuBarrier()
+  func.func private @mgpuMemAlloc(i64, !llvm.ptr, i1) -> !llvm.ptr
+  func.func private @mgpuMemFree(!llvm.ptr, !llvm.ptr)
+  func.func private @mgpuMemcpy(!llvm.ptr, !llvm.ptr, i64, !llvm.ptr)
+  func.func private @malloc(i64) -> !llvm.ptr
+  func.func private @free(!llvm.ptr)
+  func.func private @exit(i32)  // libc exit; called on a verification mismatch
+  llvm.func @printf(!llvm.ptr, ...) -> i32
+
+  llvm.mlir.global internal constant @msg_pass("[mlir/alloc] rank %d: cross-rank read PASS (peer=%d, expected=%.1f)\0A\00") {addr_space = 0 : i32}
+  llvm.mlir.global internal constant @msg_fail("[mlir/alloc] rank %d: cross-rank read FAIL (peer=%d, got=%.1f, expected=%.1f)\0A\00") {addr_space = 0 : i32}
+  llvm.mlir.global internal constant @msg_only1("[mlir/alloc] rank %d: world_size=1, skipping cross-rank read\0A\00") {addr_space = 0 : i32}
+  llvm.mlir.global internal constant @msg_done("[mlir/alloc] rank %d: ALL PASSED\0A\00") {addr_space = 0 : i32}
+
+  func.func @main() {
+    %c2 = arith.constant 2 : index
+
+    air.rank (%rid) in (%rsize = %c2) {
+      %c0_i32 = arith.constant 0 : i32
+      %c1_i32 = arith.constant 1 : i32
+      %c4096_i64 = arith.constant 4096 : i64
+      %nullptr = llvm.mlir.zero : !llvm.ptr
+      %false = arith.constant false
+      %c0 = arith.constant 0 : index
+      %c1 = arith.constant 1 : index
+      %c1024 = arith.constant 1024 : index
+
+      %rid_i64 = arith.index_cast %rid : index to i64
+      %rid_i32 = arith.trunci %rid_i64 : i64 to i32
+      %rsize_i64 = arith.index_cast %rsize : index to i64
+      %rsize_i32 = arith.trunci %rsize_i64 : i64 to i32
+
+      // === Phase 4 lowering target: memref.alloc {air.symmetric} ===
+      %buf_memref = memref.alloc() {air.symmetric} : memref<1024xf32>
+
+      // Extract the underlying pointer for use with the mgpu* runtime ABI.
+      // (Symmetric heap memory is GPU-only; CPU writes go through mgpuMemcpy.)
+      %intptr = memref.extract_aligned_pointer_as_index %buf_memref
+          : memref<1024xf32> -> index
+      %buf_int = arith.index_cast %intptr : index to i64
+      %buf = llvm.inttoptr %buf_int : i64 to !llvm.ptr
+
+      // Fill (rid+1).0 from a host buffer via mgpuMemcpy H2D.
+      %r1_i32 = arith.addi %rid_i32, %c1_i32 : i32
+      %r1_f = arith.sitofp %r1_i32 : i32 to f32
+      %hostbuf = func.call @malloc(%c4096_i64) : (i64) -> !llvm.ptr
+      scf.for %i = %c0 to %c1024 step %c1 {
+        %i_i64 = arith.index_cast %i : index to i64
+        %addr = llvm.getelementptr %hostbuf[%i_i64] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+        llvm.store %r1_f, %addr : f32, !llvm.ptr
+      }
+      func.call @mgpuMemcpy(%buf, %hostbuf, %c4096_i64, %nullptr) : (!llvm.ptr, !llvm.ptr, i64, !llvm.ptr) -> ()
+      func.call @free(%hostbuf) : (!llvm.ptr) -> ()
+
+      func.call @mgpuBarrier() : () -> ()
+
+      %is_multi = arith.cmpi sgt, %rsize_i32, %c1_i32 : i32
+      scf.if %is_multi {
+        %sum = arith.addi %rid_i32, %c1_i32 : i32
+        %peer_i32 = arith.remsi %sum, %rsize_i32 : i32
+        %bases = func.call @mgpuGetHeapBases() : () -> !llvm.ptr
+        %peer_i64 = arith.extsi %peer_i32 : i32 to i64
+        %peer_base_addr = llvm.getelementptr %bases[%peer_i64] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.ptr
+        %peer_base = llvm.load %peer_base_addr : !llvm.ptr -> !llvm.ptr
+        %local_base_addr = llvm.getelementptr %bases[%rid_i64] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.ptr
+        %local_base = llvm.load %local_base_addr : !llvm.ptr -> !llvm.ptr
+        %lb_int = llvm.ptrtoint %local_base : !llvm.ptr to i64
+        %offset = arith.subi %buf_int, %lb_int : i64
+        %peer_buf = llvm.getelementptr %peer_base[%offset] : (!llvm.ptr, i64) -> !llvm.ptr, i8
+
+        %local_copy = func.call @mgpuMemAlloc(%c4096_i64, %nullptr, %false) : (i64, !llvm.ptr, i1) -> !llvm.ptr
+        func.call @mgpuMemcpy(%local_copy, %peer_buf, %c4096_i64, %nullptr) : (!llvm.ptr, !llvm.ptr, i64, !llvm.ptr) -> ()
+        %host_rb = func.call @malloc(%c4096_i64) : (i64) -> !llvm.ptr
+        func.call @mgpuMemcpy(%host_rb, %local_copy, %c4096_i64, %nullptr) : (!llvm.ptr, !llvm.ptr, i64, !llvm.ptr) -> ()
+
+        %p1_i32 = arith.addi %peer_i32, %c1_i32 : i32
+        %expected = arith.sitofp %p1_i32 : i32 to f32
+        %c0_i64 = arith.constant 0 : i64
+        %addr0 = llvm.getelementptr %host_rb[%c0_i64] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+        %v0 = llvm.load %addr0 : !llvm.ptr -> f32
+        %ok = arith.cmpf oeq, %v0, %expected : f32
+        scf.if %ok {
+          %fmt = llvm.mlir.addressof @msg_pass : !llvm.ptr
+          %e64 = arith.extf %expected : f32 to f64
+          llvm.call @printf(%fmt, %rid_i32, %peer_i32, %e64) vararg(!llvm.func<i32 (ptr, ...)>) : (!llvm.ptr, i32, i32, f64) -> i32
+        } else {
+          // Mismatch: report it and exit non-zero, so a failed check can never
+          // be followed by the "ALL PASSED" banner and a zero exit code.
+          %fmt_f = llvm.mlir.addressof @msg_fail : !llvm.ptr
+          %g64 = arith.extf %v0 : f32 to f64
+          %x64 = arith.extf %expected : f32 to f64
+          llvm.call @printf(%fmt_f, %rid_i32, %peer_i32, %g64, %x64) vararg(!llvm.func<i32 (ptr, ...)>) : (!llvm.ptr, i32, i32, f64, f64) -> i32
+          func.call @exit(%c1_i32) : (i32) -> ()
+        }
+
+        func.call @free(%host_rb) : (!llvm.ptr) -> ()
+        func.call @mgpuMemFree(%local_copy, %nullptr) : (!llvm.ptr, !llvm.ptr) -> ()
+      } else {
+        %fmt = llvm.mlir.addressof @msg_only1 : !llvm.ptr
+        llvm.call @printf(%fmt, %rid_i32) vararg(!llvm.func<i32 (ptr, ...)>) : (!llvm.ptr, i32) -> i32
+      }
+
+      func.call @mgpuBarrier() : () -> ()
+      memref.dealloc %buf_memref : memref<1024xf32>
+
+      %fmt_done = llvm.mlir.addressof @msg_done : !llvm.ptr
+      llvm.call @printf(%fmt_done, %rid_i32) vararg(!llvm.func<i32 (ptr, ...)>) : (!llvm.ptr, i32) -> i32
+      air.rank_terminator
+    }
+    return
+  }
+}
diff --git a/test/gpu/symmetric_heap_dma/air_sym_with_channel.mlir b/test/gpu/symmetric_heap_dma/air_sym_with_channel.mlir
new file mode 100644
index 000000000..3f421db7d
--- /dev/null
+++ b/test/gpu/symmetric_heap_dma/air_sym_with_channel.mlir
@@ -0,0 +1,115 @@
+//===- air_sym_with_channel.mlir - air.channel gpu_symmetric_heap e2e ----===//
+//
+// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+//===-----------------------------------------------------------------------===//
+//
+// Highest-level form combining:
+//   - Phase 1: gpu_symmetric_heap channel_type, air.symmetric memref attribute
+//   - Phase 3: air-rank-to-mgpu (rank body inlining)
+//   - Phase 4: air-symmetric-alloc-to-mgpu (memref.alloc -> mgpuSymmetricAlloc)
+//   - Phase 6: air-gpu-channel-to-mgpu (gpu_symmetric_heap put/get -> peer-VA
+//              mgpuMemcpy + mgpuBarrier)
+//
+// Each rank fills a symmetric src buffer with (rank+1).0, publishes via
+// air.channel.put, and reads rank 0's slot via air.channel.get into a local
+// dst buffer. Both ranks should see 1.0 in dst[0].
+//
+//===-----------------------------------------------------------------------===//
+
+module {
+  func.func private @mgpuMemcpy(!llvm.ptr, !llvm.ptr, i64, !llvm.ptr)
+  func.func private @malloc(i64) -> !llvm.ptr
+  func.func private @free(!llvm.ptr)
+  func.func private @exit(i32)  // libc exit; called on a verification mismatch
+  llvm.func @printf(!llvm.ptr, ...) -> i32
+
+  llvm.mlir.global internal constant @msg_pass("[mlir/chan] rank %d: channel get PASS (read rank 0 = %.1f)\0A\00") {addr_space = 0 : i32}
+  llvm.mlir.global internal constant @msg_fail("[mlir/chan] rank %d: channel get FAIL (got=%.1f, expected=1.0)\0A\00") {addr_space = 0 : i32}
+  llvm.mlir.global internal constant @msg_done("[mlir/chan] rank %d: ALL PASSED\0A\00") {addr_space = 0 : i32}
+
+  // Channel decl at module scope (Symbol).
+  air.channel @sym_chan [] {channel_type = "gpu_symmetric_heap"}
+
+  func.func @main() {
+    %c2 = arith.constant 2 : index
+
+    air.rank (%rid) in (%rsize = %c2) {
+      %c0 = arith.constant 0 : index
+      %c1 = arith.constant 1 : index
+      %c1024 = arith.constant 1024 : index
+      %c1_i32 = arith.constant 1 : i32
+      %c4096_i64 = arith.constant 4096 : i64
+      %nullptr = llvm.mlir.zero : !llvm.ptr
+
+      %rid_i64 = arith.index_cast %rid : index to i64
+      %rid_i32 = arith.trunci %rid_i64 : i64 to i32
+
+      // Symmetric src buffer (each rank allocates same shape at same offset).
+      %src_buf = memref.alloc() {air.symmetric} : memref<1024xf32>
+      // Destination buffer; also tagged air.symmetric. NOTE(review): confirm the channel lowering needs this.
+      %dst_buf = memref.alloc() {air.symmetric} : memref<1024xf32>
+
+      // Fill src_buf with (rid+1).0 from host.
+      %r1_i32 = arith.addi %rid_i32, %c1_i32 : i32
+      %r1_f = arith.sitofp %r1_i32 : i32 to f32
+      %hostbuf = func.call @malloc(%c4096_i64) : (i64) -> !llvm.ptr
+      scf.for %i = %c0 to %c1024 step %c1 {
+        %i_i64 = arith.index_cast %i : index to i64
+        %addr = llvm.getelementptr %hostbuf[%i_i64] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+        llvm.store %r1_f, %addr : f32, !llvm.ptr
+      }
+      %src_intptr = memref.extract_aligned_pointer_as_index %src_buf
+          : memref<1024xf32> -> index
+      %src_int = arith.index_cast %src_intptr : index to i64
+      %src_ptr = llvm.inttoptr %src_int : i64 to !llvm.ptr
+      func.call @mgpuMemcpy(%src_ptr, %hostbuf, %c4096_i64, %nullptr)
+          : (!llvm.ptr, !llvm.ptr, i64, !llvm.ptr) -> ()
+
+      // === Phase 6 lowering target: gpu_symmetric_heap channel put/get ===
+      // put publishes our src_buf; get reads peer (rank 0) into dst_buf.
+      air.channel.put @sym_chan[] (%src_buf[] [] []) : (memref<1024xf32>)
+      air.channel.get @sym_chan[%c0] (%dst_buf[] [] []) : (memref<1024xf32>)
+
+      // Verify: D2H readback dst_buf to a host buffer, check element 0.
+      %dst_intptr = memref.extract_aligned_pointer_as_index %dst_buf
+          : memref<1024xf32> -> index
+      %dst_int = arith.index_cast %dst_intptr : index to i64
+      %dst_ptr = llvm.inttoptr %dst_int : i64 to !llvm.ptr
+      %host_rb = func.call @malloc(%c4096_i64) : (i64) -> !llvm.ptr
+      func.call @mgpuMemcpy(%host_rb, %dst_ptr, %c4096_i64, %nullptr)
+          : (!llvm.ptr, !llvm.ptr, i64, !llvm.ptr) -> ()
+      %c0_i64 = arith.constant 0 : i64
+      %addr0 = llvm.getelementptr %host_rb[%c0_i64] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+      %v0 = llvm.load %addr0 : !llvm.ptr -> f32
+      %expected = arith.constant 1.0 : f32
+      %ok = arith.cmpf oeq, %v0, %expected : f32
+      scf.if %ok {
+        %fmt = llvm.mlir.addressof @msg_pass : !llvm.ptr
+        %v0_64 = arith.extf %v0 : f32 to f64
+        llvm.call @printf(%fmt, %rid_i32, %v0_64) vararg(!llvm.func<i32 (ptr, ...)>)
+            : (!llvm.ptr, i32, f64) -> i32
+      } else {
+        // Mismatch: report it and exit non-zero, so a failed check can never
+        // be followed by the "ALL PASSED" banner and a zero exit code.
+        %fmt_f = llvm.mlir.addressof @msg_fail : !llvm.ptr
+        %g64 = arith.extf %v0 : f32 to f64
+        llvm.call @printf(%fmt_f, %rid_i32, %g64) vararg(!llvm.func<i32 (ptr, ...)>)
+            : (!llvm.ptr, i32, f64) -> i32
+        func.call @exit(%c1_i32) : (i32) -> ()
+      }
+      func.call @free(%host_rb) : (!llvm.ptr) -> ()
+      func.call @free(%hostbuf) : (!llvm.ptr) -> ()
+
+      memref.dealloc %dst_buf : memref<1024xf32>
+      memref.dealloc %src_buf : memref<1024xf32>
+
+      %fmt_done = llvm.mlir.addressof @msg_done : !llvm.ptr
+      llvm.call @printf(%fmt_done, %rid_i32) vararg(!llvm.func<i32 (ptr, ...)>)
+          : (!llvm.ptr, i32) -> i32
+      air.rank_terminator
+    }
+    return
+  }
+}
diff --git a/test/gpu/symmetric_heap_dma/air_sym_with_dma.mlir b/test/gpu/symmetric_heap_dma/air_sym_with_dma.mlir
new file mode 100644
index 000000000..c5d2d9413
--- /dev/null
+++ b/test/gpu/symmetric_heap_dma/air_sym_with_dma.mlir
@@ -0,0 +1,121 @@
+//===- air_sym_with_dma.mlir - air.rank + air.dma_memcpy_nd cross-rank ----===//
+//
+// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+//===-----------------------------------------------------------------------===//
+//
+// Highest-level form of the symmetric-heap test. Combines:
+//   - Phase 1: air.symmetric memref attribute, src_rank attribute on
+//              air.dma_memcpy_nd
+//   - Phase 3: air-rank-to-mgpu (rank body inlining)
+//   - Phase 4: air-symmetric-alloc-to-mgpu (memref.alloc -> mgpuSymmetricAlloc)
+//   - Phase 5: air-cross-rank-dma-to-mgpu (cross-rank dma -> peer-VA mgpuMemcpy)
+//
+// Each rank allocates two symmetric buffers (src and dst), fills its src with
+// (rank+1).0, then issues a cross-rank DMA reading rank 0's src into its
+// own dst, and verifies dst contains 1.0 on every rank.
+//
+// Reporting: a rank prints the PASS line and the closing "ALL PASSED" line
+// only when the check succeeds; on a mismatch it prints a FAIL line instead.
+//
+//===-----------------------------------------------------------------------===//
+
+module {
+  func.func private @mgpuBarrier()
+  func.func private @mgpuMemcpy(!llvm.ptr, !llvm.ptr, i64, !llvm.ptr)
+  func.func private @malloc(i64) -> !llvm.ptr
+  func.func private @free(!llvm.ptr)
+  llvm.func @printf(!llvm.ptr, ...) -> i32
+
+  llvm.mlir.global internal constant @msg_pass("[mlir/dma] rank %d: cross-rank DMA PASS (read rank 0 = %.1f)\0A\00") {addr_space = 0 : i32}
+  llvm.mlir.global internal constant @msg_fail("[mlir/dma] rank %d: cross-rank DMA FAIL (read %.1f, expected 1.0)\0A\00") {addr_space = 0 : i32}
+  llvm.mlir.global internal constant @msg_done("[mlir/dma] rank %d: ALL PASSED\0A\00") {addr_space = 0 : i32}
+
+  func.func @main() {
+    %c2 = arith.constant 2 : index
+
+    air.rank (%rid) in (%rsize = %c2) {
+      %c1_i32 = arith.constant 1 : i32
+      %c4096_i64 = arith.constant 4096 : i64
+      %nullptr = llvm.mlir.zero : !llvm.ptr
+      %c0 = arith.constant 0 : index
+      %c1 = arith.constant 1 : index
+      %c1024 = arith.constant 1024 : index
+
+      %rid_i64 = arith.index_cast %rid : index to i64
+      %rid_i32 = arith.trunci %rid_i64 : i64 to i32
+
+      // Two symmetric buffers per rank (collective allocation).
+      %src_buf = memref.alloc() {air.symmetric} : memref<1024xf32>
+      %dst_buf = memref.alloc() {air.symmetric} : memref<1024xf32>
+
+      // Get pointers for the H2D init (and later D2H verification).
+      %src_intptr = memref.extract_aligned_pointer_as_index %src_buf
+          : memref<1024xf32> -> index
+      %src_int = arith.index_cast %src_intptr : index to i64
+      %src_ptr = llvm.inttoptr %src_int : i64 to !llvm.ptr
+
+      %dst_intptr = memref.extract_aligned_pointer_as_index %dst_buf
+          : memref<1024xf32> -> index
+      %dst_int = arith.index_cast %dst_intptr : index to i64
+      %dst_ptr = llvm.inttoptr %dst_int : i64 to !llvm.ptr
+
+      // Fill src_buf with (rid+1).0 via host buffer + mgpuMemcpy H2D.
+      %r1_i32 = arith.addi %rid_i32, %c1_i32 : i32
+      %r1_f = arith.sitofp %r1_i32 : i32 to f32
+      %hostbuf = func.call @malloc(%c4096_i64) : (i64) -> !llvm.ptr
+      scf.for %i = %c0 to %c1024 step %c1 {
+        %i_i64 = arith.index_cast %i : index to i64
+        %addr = llvm.getelementptr %hostbuf[%i_i64] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+        llvm.store %r1_f, %addr : f32, !llvm.ptr
+      }
+      func.call @mgpuMemcpy(%src_ptr, %hostbuf, %c4096_i64, %nullptr)
+          : (!llvm.ptr, !llvm.ptr, i64, !llvm.ptr) -> ()
+      func.call @mgpuBarrier() : () -> ()
+
+      // === Phase 5 lowering target: cross-rank air.dma_memcpy_nd ===
+      // Both ranks read from rank 0's src_buf into their own dst_buf.
+      air.dma_memcpy_nd (%dst_buf[] [] [], %src_buf[] [] [])
+          {src_rank = 0 : i64}
+          : (memref<1024xf32>, memref<1024xf32>)
+
+      // Verify: D2H readback dst_buf to a host buffer, check element 0.
+      // On every rank, dst_buf should contain (rank0 + 1).0 == 1.0.
+      %host_rb = func.call @malloc(%c4096_i64) : (i64) -> !llvm.ptr
+      func.call @mgpuMemcpy(%host_rb, %dst_ptr, %c4096_i64, %nullptr)
+          : (!llvm.ptr, !llvm.ptr, i64, !llvm.ptr) -> ()
+      %c0_i64 = arith.constant 0 : i64
+      %addr0 = llvm.getelementptr %host_rb[%c0_i64] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+      %v0 = llvm.load %addr0 : !llvm.ptr -> f32
+      %v0_64 = arith.extf %v0 : f32 to f64
+      %expected = arith.constant 1.0 : f32
+      %ok = arith.cmpf oeq, %v0, %expected : f32
+      // Report the outcome either way so a mismatch is visible in the log.
+      scf.if %ok {
+        %fmt_pass = llvm.mlir.addressof @msg_pass : !llvm.ptr
+        llvm.call @printf(%fmt_pass, %rid_i32, %v0_64) vararg(!llvm.func<i32 (ptr, ...)>)
+            : (!llvm.ptr, i32, f64) -> i32
+      } else {
+        %fmt_fail = llvm.mlir.addressof @msg_fail : !llvm.ptr
+        llvm.call @printf(%fmt_fail, %rid_i32, %v0_64) vararg(!llvm.func<i32 (ptr, ...)>)
+            : (!llvm.ptr, i32, f64) -> i32
+      }
+      func.call @free(%host_rb) : (!llvm.ptr) -> ()
+
+      func.call @mgpuBarrier() : () -> ()
+      func.call @free(%hostbuf) : (!llvm.ptr) -> ()
+      memref.dealloc %dst_buf : memref<1024xf32>
+      memref.dealloc %src_buf : memref<1024xf32>
+
+      // Only claim overall success when the readback check held on this rank.
+      scf.if %ok {
+        %fmt_done = llvm.mlir.addressof @msg_done : !llvm.ptr
+        llvm.call @printf(%fmt_done, %rid_i32) vararg(!llvm.func<i32 (ptr, ...)>)
+            : (!llvm.ptr, i32) -> i32
+      }
+      air.rank_terminator
+    }
+    return
+  }
+}
diff --git a/test/gpu/symmetric_heap_dma/air_sym_with_rank.mlir b/test/gpu/symmetric_heap_dma/air_sym_with_rank.mlir
new file mode 100644
index 000000000..cf5416347
--- /dev/null
+++ b/test/gpu/symmetric_heap_dma/air_sym_with_rank.mlir
@@ -0,0 +1,136 @@
+//===- air_sym_with_rank.mlir - High-level air.rank multi-GPU e2e --------===//
+//
+// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+//===-----------------------------------------------------------------------===//
+//
+// Higher-level version of air_sym_handwritten.mlir that uses `air.rank` to
+// express the multi-process world. The `air-rank-to-mgpu` pass lowers
+// air.rank to inline body + mgpuGetRank() / mgpuSymmetricHeapInit / Destroy.
+//
+// Once lowered, the IR matches air_sym_handwritten.mlir's behavior. After
+// `mlir-opt --pass-pipeline=...`, both forms should run identically under
+// the multi-process driver run.sh.
+//
+// Reporting: the closing "ALL PASSED" line is printed only when the cross-rank
+// check succeeded (or was skipped for world_size=1); a mismatch prints FAIL.
+//
+//===-----------------------------------------------------------------------===//
+
+module {
+  // ---- mgpu* C ABI declarations --------------------------------------
+  func.func private @mgpuGetRank() -> i32
+  func.func private @mgpuGetWorldSize() -> i32
+  func.func private @mgpuSymmetricAlloc(i64, !llvm.ptr) -> !llvm.ptr
+  func.func private @mgpuSymmetricFree(!llvm.ptr, !llvm.ptr)
+  func.func private @mgpuGetHeapBases() -> !llvm.ptr
+  func.func private @mgpuBarrier()
+  func.func private @mgpuMemAlloc(i64, !llvm.ptr, i1) -> !llvm.ptr
+  func.func private @mgpuMemFree(!llvm.ptr, !llvm.ptr)
+  func.func private @mgpuMemcpy(!llvm.ptr, !llvm.ptr, i64, !llvm.ptr)
+
+  // libc helpers
+  func.func private @malloc(i64) -> !llvm.ptr
+  func.func private @free(!llvm.ptr)
+  llvm.func @printf(!llvm.ptr, ...) -> i32
+
+  llvm.mlir.global internal constant @msg_pass("[mlir/rank] rank %d: cross-rank read PASS (peer=%d, expected=%.1f)\0A\00") {addr_space = 0 : i32}
+  llvm.mlir.global internal constant @msg_fail("[mlir/rank] rank %d: cross-rank read FAIL (peer=%d, got=%.1f, expected=%.1f)\0A\00") {addr_space = 0 : i32}
+  llvm.mlir.global internal constant @msg_only1("[mlir/rank] rank %d: world_size=1, skipping cross-rank read\0A\00") {addr_space = 0 : i32}
+  llvm.mlir.global internal constant @msg_done("[mlir/rank] rank %d: ALL PASSED\0A\00") {addr_space = 0 : i32}
+
+  func.func @main() {
+    %c2 = arith.constant 2 : index
+
+    // High-level: a 2-rank world. The body executes once per rank.
+    air.rank (%rid) in (%rsize = %c2) {
+      %c1_i32 = arith.constant 1 : i32
+      %c4096_i64 = arith.constant 4096 : i64
+      %nullptr = llvm.mlir.zero : !llvm.ptr
+      %false = arith.constant false
+      %true = arith.constant true
+      %c0 = arith.constant 0 : index
+      %c1 = arith.constant 1 : index
+      %c1024 = arith.constant 1024 : index
+
+      // Convert rank id (index) to i32 for printf and arithmetic.
+      %rid_i64 = arith.index_cast %rid : index to i64
+      %rid_i32 = arith.trunci %rid_i64 : i64 to i32
+      %rsize_i64 = arith.index_cast %rsize : index to i64
+      %rsize_i32 = arith.trunci %rsize_i64 : i64 to i32
+
+      %buf = func.call @mgpuSymmetricAlloc(%c4096_i64, %nullptr) : (i64, !llvm.ptr) -> !llvm.ptr
+
+      // Fill buf with (rank+1).0 from host
+      %hostbuf = func.call @malloc(%c4096_i64) : (i64) -> !llvm.ptr
+      %r1_i32 = arith.addi %rid_i32, %c1_i32 : i32
+      %r1_f = arith.sitofp %r1_i32 : i32 to f32
+      scf.for %i = %c0 to %c1024 step %c1 {
+        %i_i64 = arith.index_cast %i : index to i64
+        %addr = llvm.getelementptr %hostbuf[%i_i64] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+        llvm.store %r1_f, %addr : f32, !llvm.ptr
+      }
+      func.call @mgpuMemcpy(%buf, %hostbuf, %c4096_i64, %nullptr) : (!llvm.ptr, !llvm.ptr, i64, !llvm.ptr) -> ()
+      func.call @mgpuBarrier() : () -> ()
+
+      // %passed is the peer-read check result, or true when the read is skipped.
+      %is_multi = arith.cmpi sgt, %rsize_i32, %c1_i32 : i32
+      %passed = scf.if %is_multi -> (i1) {
+        %sum = arith.addi %rid_i32, %c1_i32 : i32
+        %peer_i32 = arith.remsi %sum, %rsize_i32 : i32
+        %bases = func.call @mgpuGetHeapBases() : () -> !llvm.ptr
+        %peer_i64 = arith.extsi %peer_i32 : i32 to i64
+        %peer_base_addr = llvm.getelementptr %bases[%peer_i64] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.ptr
+        %peer_base = llvm.load %peer_base_addr : !llvm.ptr -> !llvm.ptr
+        %local_base_addr = llvm.getelementptr %bases[%rid_i64] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.ptr
+        %local_base = llvm.load %local_base_addr : !llvm.ptr -> !llvm.ptr
+        %buf_int = llvm.ptrtoint %buf : !llvm.ptr to i64
+        %lb_int = llvm.ptrtoint %local_base : !llvm.ptr to i64
+        %offset = arith.subi %buf_int, %lb_int : i64
+        %peer_buf = llvm.getelementptr %peer_base[%offset] : (!llvm.ptr, i64) -> !llvm.ptr, i8
+
+        %local_copy = func.call @mgpuMemAlloc(%c4096_i64, %nullptr, %false) : (i64, !llvm.ptr, i1) -> !llvm.ptr
+        func.call @mgpuMemcpy(%local_copy, %peer_buf, %c4096_i64, %nullptr) : (!llvm.ptr, !llvm.ptr, i64, !llvm.ptr) -> ()
+        %host_rb = func.call @malloc(%c4096_i64) : (i64) -> !llvm.ptr
+        func.call @mgpuMemcpy(%host_rb, %local_copy, %c4096_i64, %nullptr) : (!llvm.ptr, !llvm.ptr, i64, !llvm.ptr) -> ()
+
+        %p1_i32 = arith.addi %peer_i32, %c1_i32 : i32
+        %expected = arith.sitofp %p1_i32 : i32 to f32
+        %c0_i64 = arith.constant 0 : i64
+        %addr0 = llvm.getelementptr %host_rb[%c0_i64] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+        %v0 = llvm.load %addr0 : !llvm.ptr -> f32
+        %ok = arith.cmpf oeq, %v0, %expected : f32
+        %e64 = arith.extf %expected : f32 to f64
+        scf.if %ok {
+          %fmt_pass = llvm.mlir.addressof @msg_pass : !llvm.ptr
+          llvm.call @printf(%fmt_pass, %rid_i32, %peer_i32, %e64) vararg(!llvm.func<i32 (ptr, ...)>) : (!llvm.ptr, i32, i32, f64) -> i32
+        } else {
+          %fmt_fail = llvm.mlir.addressof @msg_fail : !llvm.ptr
+          %v0_64 = arith.extf %v0 : f32 to f64
+          llvm.call @printf(%fmt_fail, %rid_i32, %peer_i32, %v0_64, %e64) vararg(!llvm.func<i32 (ptr, ...)>) : (!llvm.ptr, i32, i32, f64, f64) -> i32
+        }
+
+        func.call @free(%host_rb) : (!llvm.ptr) -> ()
+        func.call @mgpuMemFree(%local_copy, %nullptr) : (!llvm.ptr, !llvm.ptr) -> ()
+        scf.yield %ok : i1
+      } else {
+        %fmt_skip = llvm.mlir.addressof @msg_only1 : !llvm.ptr
+        llvm.call @printf(%fmt_skip, %rid_i32) vararg(!llvm.func<i32 (ptr, ...)>) : (!llvm.ptr, i32) -> i32
+        scf.yield %true : i1
+      }
+
+      func.call @mgpuBarrier() : () -> ()
+      func.call @free(%hostbuf) : (!llvm.ptr) -> ()
+      func.call @mgpuSymmetricFree(%buf, %nullptr) : (!llvm.ptr, !llvm.ptr) -> ()
+
+      // Only claim overall success when the check above did not fail.
+      scf.if %passed {
+        %fmt_done = llvm.mlir.addressof @msg_done : !llvm.ptr
+        llvm.call @printf(%fmt_done, %rid_i32) vararg(!llvm.func<i32 (ptr, ...)>) : (!llvm.ptr, i32) -> i32
+      }
+      air.rank_terminator
+    }
+    return
+  }
+}
diff --git a/test/gpu/symmetric_heap_dma/run.sh b/test/gpu/symmetric_heap_dma/run.sh
new file mode 100755
index 000000000..9067bc841
--- /dev/null
+++ b/test/gpu/symmetric_heap_dma/run.sh
@@ -0,0 +1,203 @@
+#!/usr/bin/env bash
+#===- run.sh - Multi-process symmetric-heap DMA e2e test --*-
+#
+# Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
+# SPDX-License-Identifier: MIT
+#
+#===------------------------------------------------------------------===//
+#
+# Compile one of the symmetric-heap MLIR tests (selected via INPUT) and run it
+# as N processes. Each process executes the full IR; processes coordinate via
+# the symmetric heap (XGMI peer-mapped VMem buffers).
+#
+# Usage: run.sh [num_ranks]   (default: 2)
+#
+# Required environment (auto-detected when sourced via env_setup_gpu.sh):
+#   MLIR_AIR_INSTALL_DIR  - path containing lib/libairgpu.so
+#   LLVM_INSTALL_DIR      - path containing bin/mlir-opt + lib/libmlir_*.so
+#
+# Optional environment:
+#   INPUT                 - test variant to build (variants are listed next to
+#                           the INPUT default below; default: cacheline)
+#   SRC                   - pre-lowered MLIR file; required by INPUT=prelowered
+#   TMPDIR                - directory for intermediate files
+#                           (default: /tmp/air_sym_dma)
+#   HIP_VISIBLE_DEVICES   - comma-separated GPUs to use; rank i is pinned to
+#                           the i-th entry (default: devices 0..num_gpus-1)
+#
+
+set -e
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+NUM_RANKS=${1:-2}
+TMPDIR="${TMPDIR:-/tmp/air_sym_dma}"
+mkdir -p "$TMPDIR"
+
+# Reject a non-numeric rank count up front; the integer comparisons below
+# would otherwise just print a warning and let the script carry on.
+case "$NUM_RANKS" in
+  *[!0-9]*)
+    echo "ERROR: num_ranks must be a non-negative integer; got '$NUM_RANKS'." >&2
+    exit 1
+    ;;
+esac
+
+# Cross-rank symmetric-heap test fundamentally requires a producer + a
+# consumer process. Refuse single-process launches loudly rather than
+# letting the kernel silently no-op or hang.
+if [ "$NUM_RANKS" -lt 2 ]; then
+  echo "ERROR: NUM_RANKS=$NUM_RANKS; this test requires >= 2 ranks (producer + consumer)." >&2
+  exit 1
+fi
+
+# Refuse to run if there aren't enough physically distinct GPUs for one
+# rank per GPU. Colocating ranks on a single GPU would make XGMI/peer-VA
+# transparently fall back to local memory and produce false-positive PASSes.
+#
+# GPU_IDS holds one device identifier per usable GPU. When the caller already
+# restricted HIP_VISIBLE_DEVICES, its entries are kept so that each rank is
+# pinned to a GPU from that list instead of to physical device <rank>.
+GPU_IDS=()
+if [ -n "${HIP_VISIBLE_DEVICES:-}" ]; then
+  IFS=',' read -r -a VISIBLE_DEVS <<< "$HIP_VISIBLE_DEVICES"
+  for dev in "${VISIBLE_DEVS[@]}"; do
+    dev="${dev//[[:space:]]/}"
+    if [ -n "$dev" ]; then
+      GPU_IDS+=("$dev")
+    fi
+  done
+else
+  KFD_GPUS=$(grep -l '^simd_count [1-9]' /sys/class/kfd/kfd/topology/nodes/*/properties 2>/dev/null | wc -l)
+  for ((dev = 0; dev < KFD_GPUS; dev++)); do
+    GPU_IDS+=("$dev")
+  done
+fi
+NUM_GPUS=${#GPU_IDS[@]}
+if [ "$NUM_GPUS" -lt "$NUM_RANKS" ]; then
+  echo "ERROR: need >= $NUM_RANKS GPUs to validate cross-rank XGMI traffic; found $NUM_GPUS." >&2
+  echo "       This test refuses to colocate ranks on a single GPU because it would" >&2
+  echo "       silently bypass the symmetric-heap path and report false PASSes." >&2
+  exit 1
+fi
+
+LLVM_LIB_DIR="${LLVM_INSTALL_DIR:-$(dirname "$(which mlir-opt)")/..}/lib"
+AIRGPU_LIB="${MLIR_AIR_INSTALL_DIR:-$(dirname "$(which air-opt)")/..}/lib/libairgpu.so"
+
+# Input MLIR can be selected via INPUT env var.
+#   atomic    — kernel-driven producer/consumer, LLVM atomicrmw + atomic
+#               load with syncscope("") (Phase 2)
+#   cacheline — kernel-driven producer/consumer, cache-line atomicity +
+#               gpu.shuffle (Phase 2)
+#   rank      — high-level air.rank form (Phase 3)
+#   alloc     — air.symmetric memref.alloc inside air.rank (Phase 4)
+#   dma       — cross-rank air.dma_memcpy_nd (Phase 5)
+#   channel   — gpu_symmetric_heap air.channel put/get (Phase 6)
+#   prelowered — already-lowered MLIR passed via SRC=<path>; skips step 1
+INPUT="${INPUT:-cacheline}"
+case "$INPUT" in
+  atomic|cacheline)
+    # Kernel-driven test: needs the full GPU compilation chain
+    # (rocdl-attach-target → convert-gpu-to-rocdl → gpu-module-to-binary).
+    SRC_MLIR="$SCRIPT_DIR/air_sym_handwritten_${INPUT}.mlir"
+    echo "Step 1a: Expand air.translate ops ($INPUT variant)"
+    air-opt "$SRC_MLIR" --air-translate-to-llvm \
+        -o "$TMPDIR/sym_post_translate.mlir"
+    echo "Step 1b: Compile gpu.module to AMDGPU binary + finalize host"
+    mlir-opt "$TMPDIR/sym_post_translate.mlir" \
+        --pass-pipeline='builtin.module(rocdl-attach-target{chip=gfx942 O=3},gpu.module(convert-scf-to-cf,convert-gpu-to-rocdl{chipset=gfx942 runtime=HIP},reconcile-unrealized-casts),gpu-module-to-binary,func.func(gpu-async-region,convert-scf-to-cf),gpu-to-llvm,convert-to-llvm,reconcile-unrealized-casts)' \
+        -o "$TMPDIR/sym_lowered.mlir"
+    SKIP_LOWER=1
+    ;;
+  rank)
+    # Host-orchestrated test: simple LLVM-only pipeline.
+    echo "Step 1a: Lower air.rank to mgpu*"
+    air-opt "$SCRIPT_DIR/air_sym_with_rank.mlir" -air-rank-to-mgpu \
+        -o "$TMPDIR/post_rank.mlir"
+    echo "Step 1b: Lower IR to LLVM dialect"
+    mlir-opt "$TMPDIR/post_rank.mlir" \
+        --pass-pipeline='builtin.module(func.func(convert-scf-to-cf),convert-to-llvm,reconcile-unrealized-casts)' \
+        -o "$TMPDIR/sym_lowered.mlir"
+    SKIP_LOWER=1
+    ;;
+  alloc)
+    SRC="$SCRIPT_DIR/air_sym_with_alloc.mlir"
+    # Phase 4 alloc lowering, then Phase 3 rank lowering, then standard LLVM.
+    echo "Step 1a: Lower air.symmetric allocs + air.rank to mgpu*"
+    air-opt "$SRC" -air-symmetric-alloc-to-mgpu -air-rank-to-mgpu \
+        -o "$TMPDIR/post_phase4.mlir"
+    SRC="$TMPDIR/post_phase4.mlir"
+    PIPE='builtin.module(func.func(convert-scf-to-cf),convert-to-llvm,reconcile-unrealized-casts)'
+    ;;
+  dma)
+    SRC="$SCRIPT_DIR/air_sym_with_dma.mlir"
+    # Phase 5 cross-rank DMA, Phase 4 alloc, Phase 3 rank, then standard LLVM.
+    echo "Step 1a: Lower cross-rank DMA + air.symmetric allocs + air.rank to mgpu*"
+    air-opt "$SRC" -air-cross-rank-dma-to-mgpu -air-symmetric-alloc-to-mgpu \
+        -air-rank-to-mgpu -o "$TMPDIR/post_phase5.mlir"
+    SRC="$TMPDIR/post_phase5.mlir"
+    PIPE='builtin.module(func.func(convert-scf-to-cf),convert-to-llvm,reconcile-unrealized-casts)'
+    ;;
+  channel)
+    SRC="$SCRIPT_DIR/air_sym_with_channel.mlir"
+    # Phase 6 channel, Phase 4 alloc, Phase 3 rank, then standard LLVM.
+    echo "Step 1a: Lower air.channel + air.symmetric allocs + air.rank to mgpu*"
+    air-opt "$SRC" -air-gpu-channel-to-mgpu -air-symmetric-alloc-to-mgpu \
+        -air-rank-to-mgpu -o "$TMPDIR/post_phase6.mlir"
+    SRC="$TMPDIR/post_phase6.mlir"
+    PIPE='builtin.module(func.func(convert-scf-to-cf),convert-to-llvm,reconcile-unrealized-casts)'
+    ;;
+  prelowered)
+    # Pre-lowered MLIR file (e.g., output of `aircc --multi-gpu`).
+    # Path provided via SRC=path env var; bypass step 1.
+    if [ -z "${SRC:-}" ]; then
+      echo "INPUT=prelowered requires SRC=<path-to-lowered.mlir>" >&2
+      exit 1
+    fi
+    cp "$SRC" "$TMPDIR/sym_lowered.mlir"
+    SKIP_LOWER=1
+    ;;
+  *)
+    echo "Unknown INPUT=$INPUT; expected 'atomic', 'cacheline', 'rank', 'alloc', 'dma', 'channel', or 'prelowered'" >&2; exit 1;;
+esac
+
+if [ -z "${SKIP_LOWER:-}" ]; then
+  echo "Step 1c: Lower IR to LLVM dialect (INPUT=$INPUT)"
+  mlir-opt "$SRC" --pass-pipeline="$PIPE" -o "$TMPDIR/sym_lowered.mlir"
+fi
+
+echo "Step 2: Run as ${NUM_RANKS} processes"
+export AIRGPU_JOB_ID="${AIRGPU_JOB_ID:-$$}"
+
+PIDS=()
+PASS=1
+
+for i in $(seq 0 $((NUM_RANKS - 1))); do
+  (set -o pipefail
+   # Pin each process to its own GPU at the OS / HIP-visibility level.
+   # GPU_IDS[i] is the i-th usable device (a caller-set list is honoured).
+   # mlir-runner's built-in gpu.launch_func handler (and any nested call
+   # into libmlir_rocm_runtime.so) only ever sees one device, so it can't
+   # accidentally launch on the wrong one. Every rank still sees device 0
+   # internally, so airgpu uses LOCAL_RANK=0.
+   RANK=$i WORLD_SIZE=$NUM_RANKS LOCAL_RANK=0 HIP_VISIBLE_DEVICES="${GPU_IDS[$i]}" \
+   mlir-runner --entry-point-result=void \
+       --shared-libs="$LLVM_LIB_DIR/libmlir_rocm_runtime.so" \
+       --shared-libs="$AIRGPU_LIB" \
+       --shared-libs="$LLVM_LIB_DIR/libmlir_runner_utils.so" \
+       --shared-libs="$LLVM_LIB_DIR/libmlir_c_runner_utils.so" \
+       "$TMPDIR/sym_lowered.mlir" 2>&1 | sed "s/^/[rank $i] /") &
+  PIDS+=($!)
+done
+
+for pid in "${PIDS[@]}"; do
+  if ! wait "$pid"; then
+    PASS=0
+  fi
+done
+
+if [ $PASS -eq 1 ]; then
+  echo "=== ALL ${NUM_RANKS} RANKS PASSED ==="
+else
+  echo "=== SOME RANKS FAILED ==="
+  exit 1
+fi
diff --git a/tools/aircc/aircc.cpp b/tools/aircc/aircc.cpp
index 8bb7fbad5..3401afb51 100644
--- a/tools/aircc/aircc.cpp
+++ b/tools/aircc/aircc.cpp
@@ -179,6 +179,16 @@ static cl::opt<std::string>
                cl::desc("GPU runtime for ROCDL target (HIP or OpenCL)"),
                cl::init("HIP"), cl::cat(airCompilerOptions));
 
+// --multi-gpu: with --target=gpu, main() runs runMultiGpuCompilation().
+static cl::opt<bool> multiGpu(
+    "multi-gpu", cl::init(false), cl::cat(airCompilerOptions),
+    cl::desc(
+        "When --target=gpu, lower air.rank / air.symmetric memref / cross-rank "
+        "air.dma_memcpy_nd / gpu_symmetric_heap air.channel ops to mgpu* "
+        "runtime calls. Produces host-only LLVM IR; the result must be run "
+        "as N processes (RANK / WORLD_SIZE / LOCAL_RANK env vars) linked "
+        "against libairgpu.so. See test/gpu/symmetric_heap_dma/run.sh."));
+
 static cl::opt<bool>
     omitWhileTrueLoop("omit-while-true-loop",
                       cl::desc("Do not add while(true) loop around per-core "
@@ -707,6 +717,85 @@ static OwningOpRef<ModuleOp> cloneModule(ModuleOp moduleOp) {
 // GPU Compilation Pipeline
 //===----------------------------------------------------------------------===//
 
+// Multi-GPU host-only compilation pipeline. Lowers the high-level multi-GPU
+// abstractions (air.rank, air.symmetric memref, cross-rank air.dma_memcpy_nd,
+// gpu_symmetric_heap air.channel) to mgpu* runtime calls + standard LLVM.
+// Output is host-only LLVM IR meant to be run as N processes via mlir-runner
+// with RANK / WORLD_SIZE / LOCAL_RANK env vars set.
+//
+// The final IR goes to outputFilename when one was given; otherwise it is
+// written to <tmpDir>/<input stem>_final.mlir and echoed to stdout. Returns
+// failure() if air-opt or mlir-opt is not found in PATH, if either tool
+// invocation fails, or if the final IR cannot be read back for echoing.
+static LogicalResult runMultiGpuCompilation() {
+  SmallString<256> baseName(sys::path::stem(inputFilename));
+
+  auto airOpt = sys::findProgramByName("air-opt");
+  auto mlirOpt = sys::findProgramByName("mlir-opt");
+  if (!airOpt) {
+    llvm::errs() << "Error: could not find air-opt in PATH\n";
+    return failure();
+  }
+  if (!mlirOpt) {
+    llvm::errs() << "Error: could not find mlir-opt in PATH\n";
+    return failure();
+  }
+
+  if (verbose) {
+    llvm::outs() << "Multi-GPU compilation for " << inputFilename << "\n";
+    llvm::outs() << "  Tmpdir: " << tmpDir << "\n";
+  }
+
+  // Step 1: Lower multi-GPU abstractions to mgpu* runtime calls.
+  // Order: cross-rank-DMA / channel first (they reference air.symmetric
+  // allocs that survive Phase 4), then symmetric-alloc, then rank.
+  SmallString<256> step1(tmpDir);
+  sys::path::append(step1, baseName + "_mgpu.mlir");
+  if (failed(runCommand({*airOpt, inputFilename,
+                          "-air-cross-rank-dma-to-mgpu",
+                          "-air-gpu-channel-to-mgpu",
+                          "-air-symmetric-alloc-to-mgpu",
+                          "-air-rank-to-mgpu", "-o", step1.str().str()})))
+    return failure();
+
+  // Step 2: Standard LLVM lowering.
+  std::string finalOutput;
+  if (!outputFilename.empty()) {
+    finalOutput = outputFilename;
+  } else {
+    SmallString<256> tmp(tmpDir);
+    sys::path::append(tmp, baseName + "_final.mlir");
+    finalOutput = tmp.str().str();
+  }
+  std::string llvmPipeline =
+      "--pass-pipeline=builtin.module(func.func(convert-scf-to-cf),"
+      "convert-to-llvm,reconcile-unrealized-casts)";
+  if (failed(runCommand(
+          {*mlirOpt, step1.str().str(), llvmPipeline, "-o", finalOutput})))
+    return failure();
+
+  // run.sh launches the ranks itself; INPUT=prelowered makes it skip its own
+  // lowering step and run the file passed via SRC.
+  if (verbose)
+    llvm::outs() << "Multi-GPU compilation complete! Output: " << finalOutput
+                 << "\n"
+                 << "Run with: INPUT=prelowered SRC=" << finalOutput
+                 << " bash test/gpu/symmetric_heap_dma/run.sh [num_ranks]\n";
+
+  // Without -o the final IR is echoed to stdout; a file that cannot be read
+  // back is reported as an error instead of silently printing nothing.
+  if (outputFilename.empty()) {
+    auto bufOrErr = MemoryBuffer::getFile(finalOutput);
+    if (!bufOrErr) {
+      llvm::errs() << "Error: could not read " << finalOutput << ": "
+                   << bufOrErr.getError().message() << "\n";
+      return failure();
+    }
+    llvm::outs() << (*bufOrErr)->getBuffer();
+  }
+  return success();
+}
+
 static LogicalResult runGpuCompilation() {
   SmallString<256> baseName(sys::path::stem(inputFilename));
 
@@ -1675,6 +1751,8 @@ int main(int argc, char **argv) {
 
   // Dispatch based on target
   if (target.getValue() == "gpu") {
+    if (multiGpu)
+      return failed(runMultiGpuCompilation()) ? 1 : 0;
     return failed(runGpuCompilation()) ? 1 : 0;
   } else {
     return failed(runAieCompilation()) ? 1 : 0;