diff --git a/mlir/include/air/Conversion/AIRCrossRankDmaToMgpuPass.h b/mlir/include/air/Conversion/AIRCrossRankDmaToMgpuPass.h new file mode 100644 index 000000000..f3b55cad3 --- /dev/null +++ b/mlir/include/air/Conversion/AIRCrossRankDmaToMgpuPass.h @@ -0,0 +1,22 @@ +//===- AIRCrossRankDmaToMgpuPass.h ------------------------------*- C++ -*-===// +// +// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved. +// SPDX-License-Identifier: MIT +// +//===-----------------------------------------------------------------------===// + +#ifndef AIR_CONVERSION_AIR_CROSS_RANK_DMA_TO_MGPU_PASS_H +#define AIR_CONVERSION_AIR_CROSS_RANK_DMA_TO_MGPU_PASS_H + +#include "mlir/Pass/Pass.h" +#include + +namespace xilinx { +namespace air { + +std::unique_ptr createAIRCrossRankDmaToMgpuPass(); + +} // namespace air +} // namespace xilinx + +#endif // AIR_CONVERSION_AIR_CROSS_RANK_DMA_TO_MGPU_PASS_H diff --git a/mlir/include/air/Conversion/AIRGpuChannelToMgpuPass.h b/mlir/include/air/Conversion/AIRGpuChannelToMgpuPass.h new file mode 100644 index 000000000..2c9cae589 --- /dev/null +++ b/mlir/include/air/Conversion/AIRGpuChannelToMgpuPass.h @@ -0,0 +1,22 @@ +//===- AIRGpuChannelToMgpuPass.h --------------------------------*- C++ -*-===// +// +// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved. 
+// SPDX-License-Identifier: MIT +// +//===-----------------------------------------------------------------------===// + +#ifndef AIR_CONVERSION_AIR_GPU_CHANNEL_TO_MGPU_PASS_H +#define AIR_CONVERSION_AIR_GPU_CHANNEL_TO_MGPU_PASS_H + +#include "mlir/Pass/Pass.h" +#include + +namespace xilinx { +namespace air { + +std::unique_ptr createAIRGpuChannelToMgpuPass(); + +} // namespace air +} // namespace xilinx + +#endif // AIR_CONVERSION_AIR_GPU_CHANNEL_TO_MGPU_PASS_H diff --git a/mlir/include/air/Conversion/AIRRankToMgpuPass.h b/mlir/include/air/Conversion/AIRRankToMgpuPass.h new file mode 100644 index 000000000..cd19021bd --- /dev/null +++ b/mlir/include/air/Conversion/AIRRankToMgpuPass.h @@ -0,0 +1,22 @@ +//===- AIRRankToMgpuPass.h ---------------------------------------*- C++ -*-===// +// +// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved. +// SPDX-License-Identifier: MIT +// +//===-----------------------------------------------------------------------===// + +#ifndef AIR_CONVERSION_AIR_RANK_TO_MGPU_PASS_H +#define AIR_CONVERSION_AIR_RANK_TO_MGPU_PASS_H + +#include "mlir/Pass/Pass.h" +#include + +namespace xilinx { +namespace air { + +std::unique_ptr createAIRRankToMgpuPass(); + +} // namespace air +} // namespace xilinx + +#endif // AIR_CONVERSION_AIR_RANK_TO_MGPU_PASS_H diff --git a/mlir/include/air/Conversion/AIRSymmetricAllocToMgpuPass.h b/mlir/include/air/Conversion/AIRSymmetricAllocToMgpuPass.h new file mode 100644 index 000000000..3168dcfbf --- /dev/null +++ b/mlir/include/air/Conversion/AIRSymmetricAllocToMgpuPass.h @@ -0,0 +1,22 @@ +//===- AIRSymmetricAllocToMgpuPass.h ----------------------------*- C++ -*-===// +// +// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved. 
+// SPDX-License-Identifier: MIT +// +//===-----------------------------------------------------------------------===// + +#ifndef AIR_CONVERSION_AIR_SYMMETRIC_ALLOC_TO_MGPU_PASS_H +#define AIR_CONVERSION_AIR_SYMMETRIC_ALLOC_TO_MGPU_PASS_H + +#include "mlir/Pass/Pass.h" +#include + +namespace xilinx { +namespace air { + +std::unique_ptr createAIRSymmetricAllocToMgpuPass(); + +} // namespace air +} // namespace xilinx + +#endif // AIR_CONVERSION_AIR_SYMMETRIC_ALLOC_TO_MGPU_PASS_H diff --git a/mlir/include/air/Conversion/AIRTranslateToLLVMPass.h b/mlir/include/air/Conversion/AIRTranslateToLLVMPass.h new file mode 100644 index 000000000..b07830787 --- /dev/null +++ b/mlir/include/air/Conversion/AIRTranslateToLLVMPass.h @@ -0,0 +1,22 @@ +//===- AIRTranslateToLLVMPass.h --------------------------------*- C++ -*-===// +// +// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved. +// SPDX-License-Identifier: MIT +// +//===-----------------------------------------------------------------------===// + +#ifndef AIR_CONVERSION_AIR_TRANSLATE_TO_LLVM_PASS_H +#define AIR_CONVERSION_AIR_TRANSLATE_TO_LLVM_PASS_H + +#include "mlir/Pass/Pass.h" +#include + +namespace xilinx { +namespace air { + +std::unique_ptr createAIRTranslateToLLVMPass(); + +} // namespace air +} // namespace xilinx + +#endif // AIR_CONVERSION_AIR_TRANSLATE_TO_LLVM_PASS_H diff --git a/mlir/include/air/Conversion/GPUPassDetail.h b/mlir/include/air/Conversion/GPUPassDetail.h index bcf944587..4cf0e1ab8 100644 --- a/mlir/include/air/Conversion/GPUPassDetail.h +++ b/mlir/include/air/Conversion/GPUPassDetail.h @@ -23,8 +23,13 @@ namespace air { using namespace mlir; #define GEN_PASS_DECL +#define GEN_PASS_DEF_AIRTRANSLATETOLLVM #define GEN_PASS_DEF_CONVERTAIRTOROCDL #define GEN_PASS_DEF_CONVERTGPUKERNELOUTLINE +#define GEN_PASS_DEF_AIRRANKTOMGPU +#define GEN_PASS_DEF_AIRSYMMETRICALLOCTOMGPU +#define GEN_PASS_DEF_AIRCROSSRANKDMATOMGPU +#define GEN_PASS_DEF_AIRGPUCHANNELTOMGPU #include 
"air/Conversion/GPUPasses.h.inc" } // namespace air diff --git a/mlir/include/air/Conversion/GPUPasses.td b/mlir/include/air/Conversion/GPUPasses.td index ae846cf12..056104bc2 100644 --- a/mlir/include/air/Conversion/GPUPasses.td +++ b/mlir/include/air/Conversion/GPUPasses.td @@ -21,6 +21,23 @@ def ConvertAIRToROCDL : Pass<"air-to-rocdl", "ModuleOp"> { let options = []; } +def AIRTranslateToLLVM : Pass<"air-translate-to-llvm", "ModuleOp"> { + let summary = "Lower air.translate to memref.reinterpret_cast + LLVM-dialect address arithmetic"; + let description = [{ + Expands each `air.translate` op into the pointer-rebase computation: + `bases[to_rank] - bases[from_rank]`, converted from bytes to elements + of the source memref's element type, then applied as a new offset + via `memref.reinterpret_cast`. The expansion is pure arithmetic; it + works identically on host functions and inside `gpu.func`. + }]; + let constructor = "xilinx::air::createAIRTranslateToLLVMPass()"; + let dependentDialects = [ + "mlir::arith::ArithDialect", + "mlir::memref::MemRefDialect", + "mlir::LLVM::LLVMDialect" + ]; +} + def ConvertGPUKernelOutline : Pass<"air-gpu-outlining", "ModuleOp"> { let summary = "Outline GPU Kernel Func from GPU Launch"; let constructor = "xilinx::air::createGPUKernelOutlinePass()"; @@ -32,4 +49,107 @@ def ConvertGPUKernelOutline : Pass<"air-gpu-outlining", "ModuleOp"> { let options = []; } +def AIRGpuChannelToMgpu : Pass<"air-gpu-channel-to-mgpu", "ModuleOp"> { + let summary = "Lower air.channel.put/get of channel_type=\"gpu_symmetric_heap\" " + "to host-side mgpuMemcpy (peer-VA) + mgpuBarrier"; + let constructor = "xilinx::air::createAIRGpuChannelToMgpuPass()"; + let description = [{ + For each `air.channel @C [...] {channel_type = "gpu_symmetric_heap"}`, + pair its single `air.channel.put` and single `air.channel.get`. The put + becomes `mgpuBarrier()` (publish: data is already in the symmetric heap + via the put's `air.symmetric` source memref). 
The get becomes + `mgpuBarrier()` followed by `mgpuMemcpy(dst, peer_va(put_src), size)` + where the peer rank is the get's first index operand and the peer VA is + computed via `mgpuGetHeapBases()`. + + Restrictions in this initial version: + - One put and one get per channel symbol. + - Both put and get at host scope (no `gpu.launch`/`gpu.func`). + - put's source memref must be `air.symmetric`-tagged. + - get's destination memref must be in `memory_space=0`. + - "Entire memref" form only on both sides. + - get must take exactly one index operand (the peer rank). + }]; + let dependentDialects = [ + "func::FuncDialect", "arith::ArithDialect", "memref::MemRefDialect", + "LLVM::LLVMDialect" + ]; +} + +def AIRCrossRankDmaToMgpu : Pass<"air-cross-rank-dma-to-mgpu", "ModuleOp"> { + let summary = "Lower air.dma_memcpy_nd with src_rank/dst_rank to mgpuMemcpy " + "with peer-VA addressing through mgpuGetHeapBases()"; + let constructor = "xilinx::air::createAIRCrossRankDmaToMgpuPass()"; + let description = [{ + For each `air.dma_memcpy_nd` op carrying a `src_rank` or `dst_rank` + integer attribute, emit a host-side `mgpuMemcpy` whose peer-side pointer + is computed as `mgpuGetHeapBases()[peer] + (local_ptr - local_base)`. + + Restrictions in this initial version: + - Both `src` and `dst` memrefs must be in `memory_space=0`. + - The op must be at host scope (not inside any `gpu.launch`/`gpu.func`). + - "Entire memref" form only: `[]` `[]` `[]` for both sides — no + custom offsets / sizes / strides. + + Run this pass *before* `air-symmetric-alloc-to-mgpu` so that pointer + extraction (`memref.extract_aligned_pointer_as_index`) sees plain + memrefs rather than already-cast LLVM struct values.
+ }]; + let dependentDialects = [ + "func::FuncDialect", "arith::ArithDialect", "memref::MemRefDialect", + "LLVM::LLVMDialect" + ]; +} + +def AIRSymmetricAllocToMgpu : Pass<"air-symmetric-alloc-to-mgpu", "ModuleOp"> { + let summary = "Lower memref.alloc {air.symmetric} to mgpuSymmetricAlloc and " + "memref.dealloc of the result to mgpuSymmetricFree"; + let constructor = "xilinx::air::createAIRSymmetricAllocToMgpuPass()"; + let description = [{ + Replaces each `memref.alloc` carrying the unit attribute `air.symmetric` + with a call to `mgpuSymmetricAlloc(size_in_bytes, stream)` returning + `!llvm.ptr`, then builds an LLVM memref descriptor (struct) wrapping that + pointer and projects it back to the original memref type via + `builtin.unrealized_conversion_cast` so downstream uses keep working. + + For every `memref.dealloc` whose operand traces back (through a single + `unrealized_conversion_cast`) to such a symmetric alloc, the pass emits + `mgpuSymmetricFree(ptr, stream)` and erases the dealloc. + + Should run before `convert-to-llvm`. Does nothing if no `air.symmetric` + allocations are present. + }]; + let dependentDialects = [ + "func::FuncDialect", "arith::ArithDialect", "LLVM::LLVMDialect" + ]; +} + +def AIRRankToMgpu : Pass<"air-rank-to-mgpu", "ModuleOp"> { + let summary = "Lower air.rank to mgpu* runtime calls (multi-GPU process model)"; + let constructor = "xilinx::air::createAIRRankToMgpuPass()"; + let description = [{ + Each `air.rank` op is replaced by inlining its body in place, with rank + IDs computed from `mgpuGetRank()` (delinearized into the rank's N-D + iteration space) and rank sizes substituted from the static size operands. + + The pass also inserts `mgpuSymmetricHeapInit(heap_size)` at the entry of + the enclosing `func.func` (default 256 MB; configurable via the + `heap-size` option) and `mgpuSymmetricHeapDestroy()` before each + `func.return` in that function. + + This replaces `air-rank-to-launch` for the GPU pipeline. 
Unlike + `air-rank-to-launch` (which serializes ranks via `scf.for`), this pass + assumes that each process executes the whole rank body once and that the + runtime coordinates across processes via env vars (RANK / WORLD_SIZE / LOCAL_RANK) + and the symmetric-heap fabric. + }]; + let options = [ + Option<"heapSize", "heap-size", "uint64_t", "/*default=*/268435456", + "Symmetric heap size in bytes (default: 256 MB)"> + ]; + let dependentDialects = [ + "func::FuncDialect", "arith::ArithDialect" + ]; +} + #endif // AIR_CONVERSION_GPU_PASSES diff --git a/mlir/include/air/Dialect/AIR/AIR.td b/mlir/include/air/Dialect/AIR/AIR.td index 19575bb3e..74703144b 100644 --- a/mlir/include/air/Dialect/AIR/AIR.td +++ b/mlir/include/air/Dialect/AIR/AIR.td @@ -926,6 +926,43 @@ def air_ExecuteTerminatorOp : air_Op<"execute_terminator", [HasParent<"ExecuteOp [{ attr-dict ($results^ `:` type($results))? }]; } +def air_TranslateOp : air_Op<"translate", + [Pure, AllTypesMatch<["source", "result"]>]>, + Arguments<(ins AnyMemRef:$source, + Index:$from_rank, + Index:$to_rank, + MemRefRankOf<[Index], [1]>:$heap_bases)>, + Results<(outs AnyMemRef:$result)> { + let summary = "Re-express a symmetric-heap memref in another rank's address space"; + let description = [{ + Produces a memref of the same type as `$source` whose underlying + pointer references the corresponding allocation on `$to_rank`. The + `$source` memref is assumed to live on `$from_rank`'s symmetric heap. + The translation is the pointer rebase + + peer_va = bases[to_rank] + (source_ptr - bases[from_rank]) + + where `$heap_bases` is a 1-D memref of `index`-typed pointer values + (per-rank symmetric-heap base addresses) obtained from the + `mgpuGetHeapBases()` runtime hook. The host typically wraps the raw + runtime pointer as a `memref<?xindex>` once and threads it through + `gpu.launch_func` as a kernel argument. No data is moved; this op + produces a value-level "view" of peer memory.
+ + Folds to `$source` when `$from_rank` and `$to_rank` are statically + equal. + + Both ranks must address the same collective allocation on the + symmetric heap (i.e. `$source` must trace back to a + `memref.alloc {air.symmetric}`). Using this op outside that contract + is undefined. + }]; + let assemblyFormat = + [{ $source `,` $from_rank `,` $to_rank `,` $heap_bases + attr-dict `:` type($source) `,` type($heap_bases) }]; + let hasFolder = 1; +} + // AIR custom op, as a handle for a user-provided AIE kernel def air_CustomOp : air_Op<"custom", [air_AsyncOpInterface, diff --git a/mlir/lib/Conversion/AIRCrossRankDmaToMgpuPass.cpp b/mlir/lib/Conversion/AIRCrossRankDmaToMgpuPass.cpp new file mode 100644 index 000000000..34c7cee99 --- /dev/null +++ b/mlir/lib/Conversion/AIRCrossRankDmaToMgpuPass.cpp @@ -0,0 +1,247 @@ +//===- AIRCrossRankDmaToMgpuPass.cpp ---------------------------*- C++ -*-===// +// +// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved. +// SPDX-License-Identifier: MIT +// +//===-----------------------------------------------------------------------===// +// +// Lower air.dma_memcpy_nd ops carrying a `src_rank` or `dst_rank` integer +// attribute to host-side mgpuMemcpy calls with peer-VA addressing through +// mgpuGetHeapBases(). +// +// Pattern emitted (for src_rank = R): +// %size = arith.constant <bytes> : i64 +// %nullptr = llvm.mlir.zero : !llvm.ptr +// %dst_ptr = (extract aligned ptr from %dst memref) +// %src_ptr = (extract aligned ptr from %src memref) +// %my_rank = call @mgpuGetRank() : () -> i32 +// %bases = call @mgpuGetHeapBases() : () -> !llvm.ptr +// %my_base_at = llvm.getelementptr %bases[%my_rank] : ... -> !llvm.ptr, !llvm.ptr +// %my_base = llvm.load %my_base_at : !llvm.ptr -> !llvm.ptr +// %src_int = llvm.ptrtoint %src_ptr : !llvm.ptr to i64 +// %my_base_int = llvm.ptrtoint %my_base : !llvm.ptr to i64 +// %offset = arith.subi %src_int, %my_base_int : i64 +// %peer_base_at = llvm.getelementptr %bases[<R>] : ...
-> !llvm.ptr, !llvm.ptr +// %peer_base = llvm.load %peer_base_at : !llvm.ptr -> !llvm.ptr +// %peer_src = llvm.getelementptr %peer_base[%offset] : ... -> !llvm.ptr, i8 +// call @mgpuMemcpy(%dst_ptr, %peer_src, %size, %nullptr) +// +// Initial restrictions: +// - Both memrefs must have memory_space=0 (L3/global). +// - Op must be at host scope (not inside a gpu.launch / gpu.func). +// - "Entire memref" form only: empty offsets/sizes/strides on both sides. +// +//===-----------------------------------------------------------------------===// + +#include "air/Conversion/AIRCrossRankDmaToMgpuPass.h" +#include "air/Conversion/GPUPassDetail.h" +#include "air/Dialect/AIR/AIRDialect.h" + +#include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/GPU/IR/GPUDialect.h" +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/IR/Builders.h" +#include "mlir/Pass/Pass.h" + +using namespace mlir; +using namespace xilinx; + +namespace { + +// Ensure a private extern func declaration exists at module scope. +static func::FuncOp ensureExternFunc(ModuleOp module, OpBuilder &builder, + StringRef name, FunctionType type) { + if (auto fn = module.lookupSymbol(name)) + return fn; + OpBuilder::InsertionGuard guard(builder); + builder.setInsertionPointToStart(module.getBody()); + auto fn = func::FuncOp::create(builder, module.getLoc(), name, type); + fn.setPrivate(); + return fn; +} + +// Compute byte size of a static-shape memref as an i64 SSA value. 
+static Value computeMemrefByteSize(OpBuilder &b, Location loc, MemRefType ty) { + if (!ty.hasStaticShape()) + return nullptr; + int64_t numElts = 1; + for (int64_t d : ty.getShape()) + numElts *= d; + unsigned eltBits = ty.getElementType().getIntOrFloatBitWidth(); + if (eltBits == 0 || (eltBits % 8) != 0) + return nullptr; + int64_t totalBytes = numElts * (eltBits / 8); + return arith::ConstantOp::create(b, loc, b.getI64Type(), + b.getI64IntegerAttr(totalBytes)); +} + +// Extract an aligned !llvm.ptr from a memref via the standard idiom. +static Value extractAlignedPtr(OpBuilder &b, Location loc, Value memref) { + Value idx = memref::ExtractAlignedPointerAsIndexOp::create(b, loc, memref); + Value i64 = arith::IndexCastOp::create(b, loc, b.getI64Type(), idx); + auto ptrTy = LLVM::LLVMPointerType::get(b.getContext()); + return LLVM::IntToPtrOp::create(b, loc, ptrTy, i64); +} + +struct AIRCrossRankDmaToMgpuPass + : public xilinx::air::impl::AIRCrossRankDmaToMgpuBase< + AIRCrossRankDmaToMgpuPass> { + + AIRCrossRankDmaToMgpuPass() = default; + AIRCrossRankDmaToMgpuPass(const AIRCrossRankDmaToMgpuPass &) {} + + void runOnOperation() override { + auto module = getOperation(); + OpBuilder builder(module.getContext()); + auto i32Ty = builder.getI32Type(); + auto i64Ty = builder.getI64Type(); + auto ptrTy = LLVM::LLVMPointerType::get(module.getContext()); + + // Collect cross-rank DMA ops. + SmallVector crossRankDmas; + module.walk([&](air::DmaMemcpyNdOp op) { + if (op.hasCrossRank()) + crossRankDmas.push_back(op); + }); + if (crossRankDmas.empty()) + return; + + // Declare the runtime ABI functions we may need. 
+ auto getRankFn = ensureExternFunc(module, builder, "mgpuGetRank", + builder.getFunctionType({}, {i32Ty})); + auto getBasesFn = + ensureExternFunc(module, builder, "mgpuGetHeapBases", + builder.getFunctionType({}, {ptrTy})); + auto memcpyFn = ensureExternFunc( + module, builder, "mgpuMemcpy", + builder.getFunctionType({ptrTy, ptrTy, i64Ty, ptrTy}, {})); + + for (air::DmaMemcpyNdOp dma : crossRankDmas) { + Location loc = dma.getLoc(); + + // Restrictions + if (dma->getParentOfType() || + dma->getParentOfType()) { + dma.emitOpError( + "cross-rank DMA inside a GPU kernel is not yet supported"); + signalPassFailure(); + return; + } + if (!dma.getSrcOffsets().empty() || !dma.getSrcSizes().empty() || + !dma.getSrcStrides().empty() || !dma.getDstOffsets().empty() || + !dma.getDstSizes().empty() || !dma.getDstStrides().empty()) { + dma.emitOpError("cross-rank DMA with explicit offsets/sizes/strides " + "is not yet supported"); + signalPassFailure(); + return; + } + + auto srcType = cast(dma.getSrcMemref().getType()); + auto dstType = cast(dma.getDstMemref().getType()); + if (srcType.getMemorySpaceAsInt() != 0 || + dstType.getMemorySpaceAsInt() != 0) { + dma.emitOpError( + "cross-rank DMA requires both memrefs in memory_space=0"); + signalPassFailure(); + return; + } + + // Determine which side has the rank attribute. (Only one is supported + // per op for now.) + bool srcIsPeer = dma.getSrcRank().has_value(); + bool dstIsPeer = dma.getDstRank().has_value(); + if (srcIsPeer && dstIsPeer) { + dma.emitOpError( + "cross-rank DMA with both src_rank and dst_rank set is not yet " + "supported"); + signalPassFailure(); + return; + } + int64_t peerRank = + srcIsPeer ? *dma.getSrcRank() : *dma.getDstRank(); + auto peerSideType = srcIsPeer ? srcType : dstType; + Value peerMemref = srcIsPeer ? dma.getSrcMemref() : dma.getDstMemref(); + Value localMemref = + srcIsPeer ? 
dma.getDstMemref() : dma.getSrcMemref(); + + builder.setInsertionPoint(dma); + Value sizeBytes = computeMemrefByteSize(builder, loc, peerSideType); + if (!sizeBytes) { + dma.emitOpError("cross-rank DMA requires static memref shape with " + "byte-aligned element type"); + signalPassFailure(); + return; + } + Value nullPtr = LLVM::ZeroOp::create(builder, loc, ptrTy); + + Value peerLocalPtr = extractAlignedPtr(builder, loc, peerMemref); + Value localPtr = extractAlignedPtr(builder, loc, localMemref); + + // bases = mgpuGetHeapBases() + Value bases = func::CallOp::create(builder, loc, getBasesFn, ValueRange{}) + .getResult(0); + + // my_rank = mgpuGetRank() (i32 -> i64) + Value myRankI32 = + func::CallOp::create(builder, loc, getRankFn, ValueRange{}) + .getResult(0); + Value myRankI64 = arith::ExtSIOp::create(builder, loc, i64Ty, myRankI32); + + // my_base = bases[my_rank] + Value myBaseAddr = LLVM::GEPOp::create(builder, loc, ptrTy, ptrTy, bases, + ArrayRef{myRankI64}); + Value myBase = LLVM::LoadOp::create(builder, loc, ptrTy, myBaseAddr); + + // peer_base = bases[] + Value peerRankIdx = LLVM::ConstantOp::create( + builder, loc, i64Ty, builder.getI64IntegerAttr(peerRank)); + Value peerBaseAddr = LLVM::GEPOp::create( + builder, loc, ptrTy, ptrTy, bases, ArrayRef{peerRankIdx}); + Value peerBase = LLVM::LoadOp::create(builder, loc, ptrTy, peerBaseAddr); + + // offset = peerLocalPtr (as i64) - my_base (as i64) + Value peerLocalInt = + LLVM::PtrToIntOp::create(builder, loc, i64Ty, peerLocalPtr); + Value myBaseInt = LLVM::PtrToIntOp::create(builder, loc, i64Ty, myBase); + Value offset = + arith::SubIOp::create(builder, loc, peerLocalInt, myBaseInt); + + // peer_ptr = peer_base + offset (byte-stride GEP) + auto i8Ty = builder.getI8Type(); + Value peerPtr = LLVM::GEPOp::create(builder, loc, ptrTy, i8Ty, peerBase, + ArrayRef{offset}); + + // mgpuMemcpy(dst, src, size, nullptr) — substitute peerPtr on the + // peer side. + Value srcArg = srcIsPeer ? 
peerPtr : localPtr; + Value dstArg = dstIsPeer ? peerPtr : localPtr; + func::CallOp::create(builder, loc, memcpyFn, + ValueRange{dstArg, srcArg, sizeBytes, nullPtr}); + + // If this DMA returned an async token, replace it with a wait_all. + if (dma.getAsyncToken()) { + Value tok = air::WaitAllOp::create( + builder, loc, + air::AsyncTokenType::get(builder.getContext()), + ValueRange{}) + .getAsyncToken(); + dma.getAsyncToken().replaceAllUsesWith(tok); + } + dma.erase(); + } + } +}; + +} // namespace + +namespace xilinx { +namespace air { + +std::unique_ptr createAIRCrossRankDmaToMgpuPass() { + return std::make_unique(); +} + +} // namespace air +} // namespace xilinx diff --git a/mlir/lib/Conversion/AIRGpuChannelToMgpuPass.cpp b/mlir/lib/Conversion/AIRGpuChannelToMgpuPass.cpp new file mode 100644 index 000000000..272ff456e --- /dev/null +++ b/mlir/lib/Conversion/AIRGpuChannelToMgpuPass.cpp @@ -0,0 +1,285 @@ +//===- AIRGpuChannelToMgpuPass.cpp ------------------------------*- C++ -*-===// +// +// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved. +// SPDX-License-Identifier: MIT +// +//===-----------------------------------------------------------------------===// +// +// Lower air.channel of channel_type="gpu_symmetric_heap" plus its put/get +// pair to host-side mgpuMemcpy with peer-VA addressing through +// mgpuGetHeapBases(), with mgpuBarrier-based synchronization. 
+// +// Per channel: +// - put becomes mgpuBarrier() (publish — the data is already in the +// symmetric heap via the put's air.symmetric source memref) +// - get becomes mgpuBarrier() followed by mgpuMemcpy(dst, peer_va(src), sz) +// where the peer rank is the get's first index operand +// +//===-----------------------------------------------------------------------===// + +#include "air/Conversion/AIRGpuChannelToMgpuPass.h" +#include "air/Conversion/GPUPassDetail.h" +#include "air/Dialect/AIR/AIRDialect.h" + +#include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/GPU/IR/GPUDialect.h" +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/SymbolTable.h" +#include "mlir/Pass/Pass.h" + +using namespace mlir; +using namespace xilinx; + +namespace { + +static func::FuncOp ensureExternFunc(ModuleOp module, OpBuilder &builder, + StringRef name, FunctionType type) { + if (auto fn = module.lookupSymbol(name)) + return fn; + OpBuilder::InsertionGuard guard(builder); + builder.setInsertionPointToStart(module.getBody()); + auto fn = func::FuncOp::create(builder, module.getLoc(), name, type); + fn.setPrivate(); + return fn; +} + +static Value computeMemrefByteSize(OpBuilder &b, Location loc, MemRefType ty) { + if (!ty.hasStaticShape()) + return nullptr; + int64_t numElts = 1; + for (int64_t d : ty.getShape()) + numElts *= d; + unsigned eltBits = ty.getElementType().getIntOrFloatBitWidth(); + if (eltBits == 0 || (eltBits % 8) != 0) + return nullptr; + int64_t totalBytes = numElts * (eltBits / 8); + return arith::ConstantOp::create(b, loc, b.getI64Type(), + b.getI64IntegerAttr(totalBytes)); +} + +static Value extractAlignedPtr(OpBuilder &b, Location loc, Value memref) { + Value idx = memref::ExtractAlignedPointerAsIndexOp::create(b, loc, memref); + Value i64 = arith::IndexCastOp::create(b, loc, b.getI64Type(), idx); + auto ptrTy = 
LLVM::LLVMPointerType::get(b.getContext()); + return LLVM::IntToPtrOp::create(b, loc, ptrTy, i64); +} + +struct AIRGpuChannelToMgpuPass + : public xilinx::air::impl::AIRGpuChannelToMgpuBase< + AIRGpuChannelToMgpuPass> { + + AIRGpuChannelToMgpuPass() = default; + AIRGpuChannelToMgpuPass(const AIRGpuChannelToMgpuPass &) {} + + void runOnOperation() override { + auto module = getOperation(); + OpBuilder builder(module.getContext()); + auto i32Ty = builder.getI32Type(); + auto i64Ty = builder.getI64Type(); + auto ptrTy = LLVM::LLVMPointerType::get(module.getContext()); + + // Collect gpu_symmetric_heap channel decls and their put/get sites. + SmallVector chans; + module.walk([&](air::ChannelOp ch) { + if (ch.getChannelType() == "gpu_symmetric_heap") + chans.push_back(ch); + }); + if (chans.empty()) + return; + + auto getRankFn = ensureExternFunc(module, builder, "mgpuGetRank", + builder.getFunctionType({}, {i32Ty})); + auto getBasesFn = + ensureExternFunc(module, builder, "mgpuGetHeapBases", + builder.getFunctionType({}, {ptrTy})); + auto memcpyFn = ensureExternFunc( + module, builder, "mgpuMemcpy", + builder.getFunctionType({ptrTy, ptrTy, i64Ty, ptrTy}, {})); + auto barrierFn = ensureExternFunc( + module, builder, "mgpuBarrier", builder.getFunctionType({}, {})); + + for (air::ChannelOp ch : chans) { + StringAttr sym = ch.getSymNameAttr(); + + // Find puts and gets that reference this channel symbol. 
+ SmallVector puts; + SmallVector gets; + module.walk([&](air::ChannelPutOp p) { + if (p.getChanName() == sym.getValue()) + puts.push_back(p); + }); + module.walk([&](air::ChannelGetOp g) { + if (g.getChanName() == sym.getValue()) + gets.push_back(g); + }); + + if (puts.size() != 1 || gets.size() != 1) { + ch.emitOpError() + << "channel_type=\"gpu_symmetric_heap\" requires exactly one " + "put and one get per channel; found " + << puts.size() << " put(s), " << gets.size() << " get(s)"; + signalPassFailure(); + return; + } + air::ChannelPutOp put = puts.front(); + air::ChannelGetOp get = gets.front(); + + // Restrictions + if (put->getParentOfType() || + put->getParentOfType() || + get->getParentOfType() || + get->getParentOfType()) { + ch.emitOpError("gpu_symmetric_heap put/get inside a GPU kernel is " + "not yet supported"); + signalPassFailure(); + return; + } + if (!put.getSrcOffsets().empty() || !put.getSrcSizes().empty() || + !put.getSrcStrides().empty() || !get.getDstOffsets().empty() || + !get.getDstSizes().empty() || !get.getDstStrides().empty()) { + ch.emitOpError("gpu_symmetric_heap put/get with explicit " + "offsets/sizes/strides is not yet supported"); + signalPassFailure(); + return; + } + + auto srcType = cast(put.getSrc().getType()); + auto dstType = cast(get.getDst().getType()); + if (srcType.getMemorySpaceAsInt() != 0 || + dstType.getMemorySpaceAsInt() != 0) { + ch.emitOpError( + "gpu_symmetric_heap put/get requires both memrefs in memory_space=0"); + signalPassFailure(); + return; + } + + // The put's source must be air.symmetric so peers can read it. 
+ if (auto allocOp = put.getSrc().getDefiningOp()) + if (!allocOp->hasAttr("air.symmetric")) { + ch.emitOpError("gpu_symmetric_heap put requires a memref.alloc " + "carrying the \"air.symmetric\" attribute"); + signalPassFailure(); + return; + } + + if (get.getIndices().size() != 1) { + ch.emitOpError("gpu_symmetric_heap get requires exactly one index " + "operand (the peer rank)"); + signalPassFailure(); + return; + } + Value peerRankIdx = get.getIndices().front(); + + // ---- Lower put: emit barrier (publish) and erase ---- + Location putLoc = put.getLoc(); + builder.setInsertionPointAfter(put); + func::CallOp::create(builder, putLoc, barrierFn, ValueRange{}); + if (put.getAsyncToken()) { + Value tok = air::WaitAllOp::create( + builder, putLoc, + air::AsyncTokenType::get(builder.getContext()), + ValueRange{}) + .getAsyncToken(); + put.getAsyncToken().replaceAllUsesWith(tok); + } + put.erase(); + + // ---- Lower get: barrier + cross-rank mgpuMemcpy(dst, peer_va(src), sz) ---- + Location getLoc = get.getLoc(); + builder.setInsertionPoint(get); + + // Barrier (consume) + func::CallOp::create(builder, getLoc, barrierFn, ValueRange{}); + + Value sizeBytes = computeMemrefByteSize(builder, getLoc, srcType); + if (!sizeBytes) { + ch.emitOpError("gpu_symmetric_heap requires static memref shape"); + signalPassFailure(); + return; + } + Value nullPtr = LLVM::ZeroOp::create(builder, getLoc, ptrTy); + + Value srcLocalPtr = extractAlignedPtr(builder, getLoc, put.getSrc()); + Value dstLocalPtr = extractAlignedPtr(builder, getLoc, get.getDst()); + + Value bases = + func::CallOp::create(builder, getLoc, getBasesFn, ValueRange{}) + .getResult(0); + Value myRankI32 = + func::CallOp::create(builder, getLoc, getRankFn, ValueRange{}) + .getResult(0); + Value myRankI64 = + arith::ExtSIOp::create(builder, getLoc, i64Ty, myRankI32); + Value myBaseAddr = LLVM::GEPOp::create(builder, getLoc, ptrTy, ptrTy, + bases, ArrayRef{myRankI64}); + Value myBase = LLVM::LoadOp::create(builder, getLoc, 
ptrTy, myBaseAddr); + + // Peer rank: convert dynamic index operand to i64. + Value peerRankI64; + Type peerTy = peerRankIdx.getType(); + if (isa(peerTy)) + peerRankI64 = arith::IndexCastOp::create(builder, getLoc, i64Ty, + peerRankIdx); + else if (auto intTy = dyn_cast(peerTy)) { + if (intTy.getWidth() == 64) + peerRankI64 = peerRankIdx; + else + peerRankI64 = + arith::ExtSIOp::create(builder, getLoc, i64Ty, peerRankIdx); + } else { + ch.emitOpError("gpu_symmetric_heap get peer-rank index must be index " + "or integer type"); + signalPassFailure(); + return; + } + + Value peerBaseAddr = LLVM::GEPOp::create( + builder, getLoc, ptrTy, ptrTy, bases, ArrayRef{peerRankI64}); + Value peerBase = + LLVM::LoadOp::create(builder, getLoc, ptrTy, peerBaseAddr); + + Value srcLocalInt = + LLVM::PtrToIntOp::create(builder, getLoc, i64Ty, srcLocalPtr); + Value myBaseInt = + LLVM::PtrToIntOp::create(builder, getLoc, i64Ty, myBase); + Value offset = + arith::SubIOp::create(builder, getLoc, srcLocalInt, myBaseInt); + + auto i8Ty = builder.getI8Type(); + Value peerSrc = LLVM::GEPOp::create(builder, getLoc, ptrTy, i8Ty, + peerBase, ArrayRef{offset}); + + func::CallOp::create( + builder, getLoc, memcpyFn, + ValueRange{dstLocalPtr, peerSrc, sizeBytes, nullPtr}); + + if (get.getAsyncToken()) { + Value tok = air::WaitAllOp::create( + builder, getLoc, + air::AsyncTokenType::get(builder.getContext()), + ValueRange{}) + .getAsyncToken(); + get.getAsyncToken().replaceAllUsesWith(tok); + } + get.erase(); + + // The channel symbol can now be erased. 
+ ch.erase(); + } + } +}; + +} // namespace + +namespace xilinx { +namespace air { + +std::unique_ptr createAIRGpuChannelToMgpuPass() { + return std::make_unique(); +} + +} // namespace air +} // namespace xilinx diff --git a/mlir/lib/Conversion/AIRRankToMgpuPass.cpp b/mlir/lib/Conversion/AIRRankToMgpuPass.cpp new file mode 100644 index 000000000..654120cbc --- /dev/null +++ b/mlir/lib/Conversion/AIRRankToMgpuPass.cpp @@ -0,0 +1,181 @@ +//===- AIRRankToMgpuPass.cpp -----------------------------------*- C++ -*-===// +// +// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved. +// SPDX-License-Identifier: MIT +// +//===-----------------------------------------------------------------------===// +// +// Lower air.rank to mgpu* runtime calls (multi-GPU process model). +// +// Each `air.rank` op is replaced by inlining its body in place, with rank +// IDs computed from `mgpuGetRank()` (delinearized into the rank's N-D +// iteration space) and rank sizes substituted from the static size operands. +// +// The pass also inserts `mgpuSymmetricHeapInit(heap_size)` at the entry of +// the enclosing `func.func` and `mgpuSymmetricHeapDestroy()` before each +// `func.return` in that function. +// +//===-----------------------------------------------------------------------===// + +#include "air/Conversion/AIRRankToMgpuPass.h" +#include "air/Conversion/GPUPassDetail.h" +#include "air/Dialect/AIR/AIRDialect.h" + +#include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/IRMapping.h" +#include "mlir/Pass/Pass.h" + +using namespace mlir; +using namespace xilinx; + +namespace { + +// Ensure a private extern func declaration exists at the top of the module. 
+// An existing func.func symbol named `name` is reused without checking that
+// its type matches `type`.
+static func::FuncOp ensureExternFunc(ModuleOp module, OpBuilder &builder,
+                                     StringRef name, FunctionType type) {
+  if (auto fn = module.lookupSymbol<func::FuncOp>(name))
+    return fn;
+  // The guard restores the caller's insertion point when we return.
+  OpBuilder::InsertionGuard guard(builder);
+  builder.setInsertionPointToStart(module.getBody());
+  auto fn = func::FuncOp::create(builder, module.getLoc(), name, type);
+  fn.setPrivate();
+  return fn;
+}
+
+// Module pass that inlines every air.rank body in place and wires up the
+// mgpu* runtime calls (heap init/destroy, rank query) around it.
+struct AIRRankToMgpuPass
+    : public xilinx::air::impl::AIRRankToMgpuBase<AIRRankToMgpuPass> {
+
+  AIRRankToMgpuPass() = default;
+  AIRRankToMgpuPass(const AIRRankToMgpuPass &pass) {}
+
+  void runOnOperation() override {
+    auto module = getOperation();
+    OpBuilder builder(module.getContext());
+    auto i32Ty = builder.getI32Type();
+    auto i64Ty = builder.getI64Type();
+    auto idxTy = builder.getIndexType();
+
+    // Collect all air.rank ops and their parent functions.
+    SmallVector<air::RankOp> rankOps;
+    SetVector<func::FuncOp> rankParentFuncs;
+    module.walk([&](air::RankOp op) {
+      rankOps.push_back(op);
+      if (auto fn = op->getParentOfType<func::FuncOp>())
+        rankParentFuncs.insert(fn);
+    });
+
+    // If no air.rank ops exist, leave the module untouched.
+    if (rankOps.empty())
+      return;
+
+    // Declare the mgpu* runtime ABI functions (only when needed).
+    auto initFn = ensureExternFunc(module, builder, "mgpuSymmetricHeapInit",
+                                   builder.getFunctionType({i64Ty}, {}));
+    auto destroyFn =
+        ensureExternFunc(module, builder, "mgpuSymmetricHeapDestroy",
+                         builder.getFunctionType({}, {}));
+    auto getRankFn = ensureExternFunc(module, builder, "mgpuGetRank",
+                                      builder.getFunctionType({}, {i32Ty}));
+
+    // For each parent function, insert mgpuSymmetricHeapInit at entry and
+    // mgpuSymmetricHeapDestroy before each return.
+    for (func::FuncOp fn : rankParentFuncs) {
+      if (fn.empty())
+        continue;
+      Block &entry = fn.front();
+      Location loc = fn.getLoc();
+      builder.setInsertionPointToStart(&entry);
+      Value heapSizeVal = arith::ConstantOp::create(
+          builder, loc, i64Ty,
+          builder.getI64IntegerAttr(static_cast<int64_t>(heapSize)));
+      func::CallOp::create(builder, loc, initFn, ValueRange{heapSizeVal});
+
+      // Insert destroy before every return op. The returns are collected
+      // first so that no ops are inserted while the walk is in progress.
+      SmallVector<func::ReturnOp> returns;
+      fn.walk([&](func::ReturnOp r) { returns.push_back(r); });
+      for (func::ReturnOp r : returns) {
+        builder.setInsertionPoint(r);
+        func::CallOp::create(builder, r.getLoc(), destroyFn, ValueRange{});
+      }
+    }
+
+    // Lower each air.rank op.
+    for (air::RankOp rankOp : rankOps) {
+      builder.setInsertionPoint(rankOp);
+      Location loc = rankOp.getLoc();
+
+      // If the rank has async dependencies, insert a blocking wait before
+      // proceeding.
+      if (!rankOp.getAsyncDependencies().empty()) {
+        air::WaitAllOp::create(builder, loc, Type{},
+                               rankOp.getAsyncDependencies());
+      }
+
+      // Get the flat rank id from mgpuGetRank() and convert to index.
+      Value rankI32 =
+          func::CallOp::create(builder, loc, getRankFn, ValueRange{})
+              .getResult(0);
+      Value rankI64 = arith::ExtSIOp::create(builder, loc, i64Ty, rankI32);
+      Value flatRank =
+          arith::IndexCastOp::create(builder, loc, idxTy, rankI64);
+
+      // Delinearize flatRank into N rank IDs using the static size operands.
+      // For sizes [s0, s1, ..., sn-1]:
+      //   id[0]   = flat % s0
+      //   id[1]   = (flat / s0) % s1
+      //   ...
+      //   id[n-1] = flat / (s0 * s1 * ... * sn-2)
+      auto sizeOpers = rankOp.getSizeOperands();
+      unsigned n = rankOp.getNumDims();
+      SmallVector<Value> ids(n);
+      Value remaining = flatRank;
+      for (unsigned d = 0; d < n; ++d) {
+        if (d == n - 1) {
+          // The outermost id takes whatever is left; it is not reduced
+          // modulo its own size.
+          ids[d] = remaining;
+        } else {
+          ids[d] =
+              arith::RemSIOp::create(builder, loc, remaining, sizeOpers[d]);
+          remaining =
+              arith::DivSIOp::create(builder, loc, remaining, sizeOpers[d]);
+        }
+      }
+
+      // Build remap and clone the body.
+      IRMapping remap;
+      for (unsigned d = 0; d < n; ++d) {
+        remap.map(rankOp.getIds()[d], ids[d]);
+        remap.map(rankOp.getSize()[d], sizeOpers[d]);
+      }
+      for (unsigned i = 0; i < rankOp.getNumKernelOperands(); ++i)
+        remap.map(rankOp.getKernelArgument(i), rankOp.getKernelOperand(i));
+
+      // Clone everything except the region terminator in front of the rank
+      // op (the builder's insertion point).
+      for (Operation &bodyOp : rankOp.getBody().front().without_terminator())
+        builder.clone(bodyOp, remap);
+
+      // Replace the async token (if any) with a synchronous wait_all.
+      if (rankOp.getAsyncToken()) {
+        auto waitAll = air::WaitAllOp::create(
+            builder, loc, air::AsyncTokenType::get(builder.getContext()),
+            ValueRange{});
+        rankOp.getAsyncToken().replaceAllUsesWith(waitAll.getAsyncToken());
+      }
+
+      rankOp.erase();
+    }
+  }
+};
+
+} // namespace
+
+namespace xilinx {
+namespace air {
+
+std::unique_ptr<mlir::Pass> createAIRRankToMgpuPass() {
+  return std::make_unique<AIRRankToMgpuPass>();
+}
+
+} // namespace air
+} // namespace xilinx
diff --git a/mlir/lib/Conversion/AIRSymmetricAllocToMgpuPass.cpp b/mlir/lib/Conversion/AIRSymmetricAllocToMgpuPass.cpp
new file mode 100644
index 000000000..864f89c1c
--- /dev/null
+++ b/mlir/lib/Conversion/AIRSymmetricAllocToMgpuPass.cpp
@@ -0,0 +1,199 @@
+//===- AIRSymmetricAllocToMgpuPass.cpp -------------------------*- C++ -*-===//
+//
+// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+//===-----------------------------------------------------------------------===//
+//
+// Lower memref.alloc carrying the `air.symmetric` attribute to a call to the
+// runtime function `mgpuSymmetricAlloc`. The returned `!llvm.ptr` is wrapped
+// in an LLVM memref descriptor (struct) and projected back to the original
+// memref type via `builtin.unrealized_conversion_cast` so that downstream
+// uses keep working.
+//
+// `memref.dealloc` ops whose operand traces (through a single
+// `unrealized_conversion_cast`) back to a symmetric alloc are rewritten to
+// `mgpuSymmetricFree`.
+//
+//===-----------------------------------------------------------------------===//
+
+#include "air/Conversion/AIRSymmetricAllocToMgpuPass.h"
+#include "air/Conversion/GPUPassDetail.h"
+
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/Pass/Pass.h"
+
+using namespace mlir;
+using namespace xilinx;
+
+namespace {
+
+// Ensure a private extern func declaration exists at module scope.
+// An existing func.func symbol named `name` is reused without checking that
+// its type matches `type`.
+static func::FuncOp ensureExternFunc(ModuleOp module, OpBuilder &builder,
+                                     StringRef name, FunctionType type) {
+  if (auto fn = module.lookupSymbol<func::FuncOp>(name))
+    return fn;
+  // The guard restores the caller's insertion point when we return.
+  OpBuilder::InsertionGuard guard(builder);
+  builder.setInsertionPointToStart(module.getBody());
+  auto fn = func::FuncOp::create(builder, module.getLoc(), name, type);
+  fn.setPrivate();
+  return fn;
+}
+
+// Compute the byte size of a static-shaped memref as an i64 SSA value.
+// Returns nullptr if the memref is dynamically shaped, if its element type is
+// not an integer or float, or if the element width is not a whole number of
+// bytes.
+static Value computeMemrefByteSize(OpBuilder &b, Location loc, MemRefType ty) {
+  if (!ty.hasStaticShape())
+    return nullptr;
+  // getIntOrFloatBitWidth() asserts on any other element type (index,
+  // complex, vector, ...), so reject those before querying the width.
+  Type eltTy = ty.getElementType();
+  if (!eltTy.isIntOrFloat())
+    return nullptr;
+  unsigned eltBits = eltTy.getIntOrFloatBitWidth();
+  if (eltBits == 0 || (eltBits % 8) != 0)
+    return nullptr;
+  int64_t numElts = 1;
+  for (int64_t d : ty.getShape())
+    numElts *= d;
+  int64_t totalBytes = numElts * (eltBits / 8);
+  return arith::ConstantOp::create(b, loc, b.getI64Type(),
+                                   b.getI64IntegerAttr(totalBytes));
+}
+
+// Build an LLVM memref descriptor struct populated with the given pointer.
+// For now we support only static-shape, contiguous, identity-layout memrefs
+// without an offset. For dimensions: sizes from the type, strides as
+// row-major (innermost stride = 1).
+// Both the allocated and the aligned pointer fields are set to `ptr`; the
+// layout of `memrefTy` is not inspected here, so callers must reject
+// non-identity layouts themselves.
+static Value buildMemrefDescriptor(OpBuilder &b, Location loc,
+                                   MemRefType memrefTy, Value ptr) {
+  ArrayRef<int64_t> shape = memrefTy.getShape();
+  unsigned rank = shape.size();
+  auto i64Ty = b.getI64Type();
+  auto ptrTy = LLVM::LLVMPointerType::get(b.getContext());
+
+  // Build the descriptor type:
+  //   !llvm.struct<(ptr, ptr, i64, array<rank x i64>, array<rank x i64>)>.
+  // For rank-0 memrefs, MLIR omits the size/stride arrays.
+  SmallVector<Type> descFields;
+  descFields.push_back(ptrTy);
+  descFields.push_back(ptrTy);
+  descFields.push_back(i64Ty);
+  if (rank > 0) {
+    descFields.push_back(LLVM::LLVMArrayType::get(i64Ty, rank));
+    descFields.push_back(LLVM::LLVMArrayType::get(i64Ty, rank));
+  }
+  auto structTy = LLVM::LLVMStructType::getLiteral(b.getContext(), descFields);
+
+  // Fields 0/1: allocated and aligned pointer; field 2: offset (always 0).
+  Value desc = LLVM::PoisonOp::create(b, loc, structTy);
+  desc = LLVM::InsertValueOp::create(b, loc, desc, ptr, ArrayRef<int64_t>{0});
+  desc = LLVM::InsertValueOp::create(b, loc, desc, ptr, ArrayRef<int64_t>{1});
+  Value zero = LLVM::ConstantOp::create(b, loc, i64Ty, b.getI64IntegerAttr(0));
+  desc = LLVM::InsertValueOp::create(b, loc, desc, zero, ArrayRef<int64_t>{2});
+
+  if (rank > 0) {
+    // Compute row-major strides from shape (innermost = 1).
+    SmallVector<int64_t> strides(rank, 1);
+    for (int i = static_cast<int>(rank) - 2; i >= 0; --i)
+      strides[i] = strides[i + 1] * shape[i + 1];
+    // Field 3 holds the sizes, field 4 the strides.
+    for (unsigned i = 0; i < rank; ++i) {
+      Value sz = LLVM::ConstantOp::create(b, loc, i64Ty,
+                                          b.getI64IntegerAttr(shape[i]));
+      desc = LLVM::InsertValueOp::create(
+          b, loc, desc, sz, ArrayRef<int64_t>{3, static_cast<int64_t>(i)});
+      Value st = LLVM::ConstantOp::create(b, loc, i64Ty,
+                                          b.getI64IntegerAttr(strides[i]));
+      desc = LLVM::InsertValueOp::create(
+          b, loc, desc, st, ArrayRef<int64_t>{4, static_cast<int64_t>(i)});
+    }
+  }
+  return desc;
+}
+
+// Module pass that rewrites `air.symmetric` memref.alloc ops (and the
+// memref.dealloc ops that free them) to mgpuSymmetricAlloc/mgpuSymmetricFree
+// runtime calls.
+struct AIRSymmetricAllocToMgpuPass
+    : public xilinx::air::impl::AIRSymmetricAllocToMgpuBase<
+          AIRSymmetricAllocToMgpuPass> {
+
+  AIRSymmetricAllocToMgpuPass() = default;
+  AIRSymmetricAllocToMgpuPass(const AIRSymmetricAllocToMgpuPass &) {}
+
+  void runOnOperation() override {
+    auto module = getOperation();
+    OpBuilder builder(module.getContext());
+    auto i64Ty = builder.getI64Type();
+    auto ptrTy = LLVM::LLVMPointerType::get(module.getContext());
+
+    // Collect symmetric allocs.
+    SmallVector<memref::AllocOp> symAllocs;
+    module.walk([&](memref::AllocOp op) {
+      if (op->hasAttr("air.symmetric"))
+        symAllocs.push_back(op);
+    });
+
+    if (symAllocs.empty())
+      return;
+
+    auto allocFn = ensureExternFunc(
+        module, builder, "mgpuSymmetricAlloc",
+        builder.getFunctionType({i64Ty, ptrTy}, {ptrTy}));
+    auto freeFn = ensureExternFunc(
+        module, builder, "mgpuSymmetricFree",
+        builder.getFunctionType({ptrTy, ptrTy}, {}));
+
+    // Track the !llvm.ptr backing each lowered memref so deallocs can look
+    // them up.
+    DenseMap<Value, Value> symmetricMemrefToPtr;
+
+    for (memref::AllocOp alloc : symAllocs) {
+      auto memrefTy = alloc.getType();
+      Location loc = alloc.getLoc();
+      builder.setInsertionPoint(alloc);
+
+      // buildMemrefDescriptor emits offset 0 and row-major strides, which is
+      // only correct for an identity layout; reject anything else instead of
+      // silently building a wrong descriptor.
+      if (!memrefTy.getLayout().isIdentity()) {
+        alloc.emitOpError(
+            "air.symmetric memref.alloc requires an identity-layout memref");
+        signalPassFailure();
+        return;
+      }
+
+      Value sizeBytes = computeMemrefByteSize(builder, loc, memrefTy);
+      if (!sizeBytes) {
+        alloc.emitOpError(
+            "air.symmetric memref.alloc requires a static-shape memref with "
+            "byte-aligned element type");
+        signalPassFailure();
+        return;
+      }
+      // The second runtime argument is always passed as a null pointer.
+      Value nullPtr = LLVM::ZeroOp::create(builder, loc, ptrTy);
+      Value ptr = func::CallOp::create(builder, loc, allocFn,
+                                       ValueRange{sizeBytes, nullPtr})
+                      .getResult(0);
+
+      Value desc = buildMemrefDescriptor(builder, loc, memrefTy, ptr);
+      Value newMemref = UnrealizedConversionCastOp::create(
+                            builder, loc, TypeRange{memrefTy}, ValueRange{desc})
+                            .getResult(0);
+      symmetricMemrefToPtr[newMemref] = ptr;
+      alloc.getResult().replaceAllUsesWith(newMemref);
+      alloc.erase();
+    }
+
+    // Lower deallocs whose operand traces back to a symmetric alloc. Only a
+    // dealloc whose operand is directly the cast result recorded above is
+    // matched; every other dealloc is left alone.
+    SmallVector<memref::DeallocOp> deallocs;
+    module.walk([&](memref::DeallocOp op) { deallocs.push_back(op); });
+    for (memref::DeallocOp d : deallocs) {
+      Value src = d.getMemref();
+      auto it = symmetricMemrefToPtr.find(src);
+      if (it == symmetricMemrefToPtr.end())
+        continue; // not a symmetric memref
+      builder.setInsertionPoint(d);
+      Value nullPtr = LLVM::ZeroOp::create(builder, d.getLoc(), ptrTy);
+      func::CallOp::create(builder, d.getLoc(), freeFn,
+                           ValueRange{it->second, nullPtr});
+      d.erase();
+    }
+  }
+};
+
+} // namespace
+
+namespace xilinx {
+namespace air {
+
+std::unique_ptr<mlir::Pass> createAIRSymmetricAllocToMgpuPass() {
+  return std::make_unique<AIRSymmetricAllocToMgpuPass>();
+}
+
+} // namespace air
+} // namespace xilinx
diff --git a/mlir/lib/Conversion/AIRTranslateToLLVMPass.cpp b/mlir/lib/Conversion/AIRTranslateToLLVMPass.cpp
new file mode 100644
index 000000000..eeae715b6
--- /dev/null
+++ b/mlir/lib/Conversion/AIRTranslateToLLVMPass.cpp
@@ -0,0 +1,175 @@
+//===- AIRTranslateToLLVMPass.cpp -------------------------------*- C++ -*-===//
+//
+// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved. +// SPDX-License-Identifier: MIT +// +//===-----------------------------------------------------------------------===// +// +// Lower air.translate to memref-descriptor construction over a peer-rebased +// pointer. +// +// For each `air.translate %src, %from, %to, %bases`: +// 1. Extract the source memref's aligned pointer as `index`. +// 2. Read per-rank base addresses from the heap_bases memref: +// from_base = bases[from] +// to_base = bases[to] +// via memref.load (each element is an `index` — a pointer-width +// integer). +// 3. Compute the peer aligned index: +// peer_aligned = src_aligned + (to_base - from_base) +// 4. Materialize the peer aligned address as !llvm.ptr (needed only for +// the descriptor build below — memref descriptors are LLVM structs). +// 5. Build a fresh LLVM memref descriptor (poison + insertvalue chain) +// whose allocated/aligned pointers both reference the peer address; +// offset = 0, sizes/strides come from the source memref's static type. +// 6. unrealized_conversion_cast the descriptor back to the result memref +// type so downstream uses keep working through the standard +// memref-to-llvm pipeline. +// +// Steps 1-3 use only memref + arith + index ops. The LLVM dialect appears +// only in steps 4-5 where it is unavoidable (memref descriptors *are* LLVM +// structs). The lowering is therefore valid both at host scope and inside +// `gpu.func` — the kernel just needs the heap_bases memref as an argument. 
+//
+//===-----------------------------------------------------------------------===//
+
+#include "air/Conversion/AIRTranslateToLLVMPass.h"
+#include "air/Conversion/GPUPassDetail.h"
+#include "air/Dialect/AIR/AIRDialect.h"
+
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/Pass/Pass.h"
+
+using namespace mlir;
+using namespace xilinx;
+
+namespace {
+
+// Build a fresh LLVM memref descriptor for `memrefTy` whose
+// allocated_ptr and aligned_ptr both reference `ptr`, offset is 0, and
+// sizes/strides come from the static type (row-major).
+//
+// Mirrors buildMemrefDescriptor in AIRSymmetricAllocToMgpuPass.
+//
+// The layout of `memrefTy` is not inspected here, so callers must reject
+// non-identity layouts themselves.
+static Value buildPeerDescriptor(OpBuilder &b, Location loc,
+                                 MemRefType memrefTy, Value ptr) {
+  ArrayRef<int64_t> shape = memrefTy.getShape();
+  unsigned rank = shape.size();
+  auto i64Ty = b.getI64Type();
+  auto ptrTy = LLVM::LLVMPointerType::get(b.getContext());
+
+  // Descriptor fields: allocated ptr, aligned ptr, offset, and (for rank > 0)
+  // the size and stride arrays.
+  SmallVector<Type> descFields;
+  descFields.push_back(ptrTy);
+  descFields.push_back(ptrTy);
+  descFields.push_back(i64Ty);
+  if (rank > 0) {
+    descFields.push_back(LLVM::LLVMArrayType::get(i64Ty, rank));
+    descFields.push_back(LLVM::LLVMArrayType::get(i64Ty, rank));
+  }
+  auto structTy = LLVM::LLVMStructType::getLiteral(b.getContext(), descFields);
+
+  Value desc = LLVM::PoisonOp::create(b, loc, structTy);
+  desc = LLVM::InsertValueOp::create(b, loc, desc, ptr, ArrayRef<int64_t>{0});
+  desc = LLVM::InsertValueOp::create(b, loc, desc, ptr, ArrayRef<int64_t>{1});
+  Value zero = LLVM::ConstantOp::create(b, loc, i64Ty, b.getI64IntegerAttr(0));
+  desc = LLVM::InsertValueOp::create(b, loc, desc, zero, ArrayRef<int64_t>{2});
+
+  if (rank > 0) {
+    // Row-major strides: innermost stride is 1.
+    SmallVector<int64_t> strides(rank, 1);
+    for (int i = static_cast<int>(rank) - 2; i >= 0; --i)
+      strides[i] = strides[i + 1] * shape[i + 1];
+    for (unsigned i = 0; i < rank; ++i) {
+      Value sz = LLVM::ConstantOp::create(b, loc, i64Ty,
+                                          b.getI64IntegerAttr(shape[i]));
+      desc = LLVM::InsertValueOp::create(
+          b, loc, desc, sz, ArrayRef<int64_t>{3, static_cast<int64_t>(i)});
+      Value st = LLVM::ConstantOp::create(b, loc, i64Ty,
+                                          b.getI64IntegerAttr(strides[i]));
+      desc = LLVM::InsertValueOp::create(
+          b, loc, desc, st, ArrayRef<int64_t>{4, static_cast<int64_t>(i)});
+    }
+  }
+  return desc;
+}
+
+// Module pass that replaces every air.translate with a memref descriptor
+// built over the peer-rebased pointer.
+struct AIRTranslateToLLVMPass
+    : public xilinx::air::impl::AIRTranslateToLLVMBase<AIRTranslateToLLVMPass> {
+
+  AIRTranslateToLLVMPass() = default;
+  AIRTranslateToLLVMPass(const AIRTranslateToLLVMPass &) {}
+
+  void runOnOperation() override {
+    auto module = getOperation();
+    auto *ctx = module.getContext();
+    OpBuilder builder(ctx);
+    auto i64Ty = builder.getI64Type();
+    auto ptrTy = LLVM::LLVMPointerType::get(ctx);
+
+    SmallVector<air::TranslateOp> translates;
+    module.walk([&](air::TranslateOp op) { translates.push_back(op); });
+    if (translates.empty())
+      return;
+
+    for (air::TranslateOp op : translates) {
+      builder.setInsertionPoint(op);
+      Location loc = op.getLoc();
+
+      auto memrefTy = cast<MemRefType>(op.getSource().getType());
+      if (!memrefTy.hasStaticShape()) {
+        op.emitOpError("air.translate requires a static-shape source memref");
+        signalPassFailure();
+        return;
+      }
+      // buildPeerDescriptor emits offset 0 and row-major strides, which is
+      // only correct for an identity layout; reject anything else instead of
+      // silently building a wrong descriptor.
+      if (!memrefTy.getLayout().isIdentity()) {
+        op.emitOpError(
+            "air.translate requires an identity-layout source memref");
+        signalPassFailure();
+        return;
+      }
+
+      // Extract source aligned pointer (as index — pointer-width integer).
+      Value srcAlignedIdx = memref::ExtractAlignedPointerAsIndexOp::create(
+          builder, loc, op.getSource());
+
+      // Load bases[from] / bases[to] as index values. Each element of the
+      // heap_bases memref is a per-rank symmetric-heap base
+      // address stored as a pointer-width integer.
+      Value fromBaseIdx = memref::LoadOp::create(
+          builder, loc, op.getHeapBases(), ValueRange{op.getFromRank()});
+      Value toBaseIdx = memref::LoadOp::create(builder, loc, op.getHeapBases(),
+                                               ValueRange{op.getToRank()});
+
+      // peer_aligned_idx = srcAlignedIdx + (toBaseIdx - fromBaseIdx)
+      Value diffIdx =
+          arith::SubIOp::create(builder, loc, toBaseIdx, fromBaseIdx);
+      Value peerAlignedIdx =
+          arith::AddIOp::create(builder, loc, srcAlignedIdx, diffIdx);
+
+      // Materialize as !llvm.ptr for the descriptor build below (the
+      // descriptor's allocated/aligned-ptr fields are LLVM-typed because
+      // memref descriptors are LLVM structs).
+      Value peerAlignedI64 =
+          arith::IndexCastOp::create(builder, loc, i64Ty, peerAlignedIdx);
+      Value peerAlignedPtr =
+          LLVM::IntToPtrOp::create(builder, loc, ptrTy, peerAlignedI64);
+
+      // Build a fresh memref descriptor with the peer aligned pointer.
+      Value desc = buildPeerDescriptor(builder, loc, memrefTy, peerAlignedPtr);
+      Value newMemref = UnrealizedConversionCastOp::create(
+                            builder, loc, TypeRange{memrefTy}, ValueRange{desc})
+                            .getResult(0);
+
+      op.getResult().replaceAllUsesWith(newMemref);
+      op.erase();
+    }
+  }
+};
+
+} // namespace
+
+namespace xilinx {
+namespace air {
+
+std::unique_ptr<mlir::Pass> createAIRTranslateToLLVMPass() {
+  return std::make_unique<AIRTranslateToLLVMPass>();
+}
+
+} // namespace air
+} // namespace xilinx
diff --git a/mlir/lib/Conversion/CMakeLists.txt b/mlir/lib/Conversion/CMakeLists.txt
index 46c0101b1..124a2dc6b 100644
--- a/mlir/lib/Conversion/CMakeLists.txt
+++ b/mlir/lib/Conversion/CMakeLists.txt
@@ -55,7 +55,12 @@ if(AIR_ENABLE_GPU)
   set(GPU_PASS_DEPENDS AIRGPUConversionPassIncGen)
   list(APPEND CONVERSION_SOURCES
     AIRToROCDLPass.cpp
+    AIRTranslateToLLVMPass.cpp
     GPUKernelOutlinePass.cpp
+    AIRRankToMgpuPass.cpp
+    AIRSymmetricAllocToMgpuPass.cpp
+    AIRCrossRankDmaToMgpuPass.cpp
+    AIRGpuChannelToMgpuPass.cpp
   )
   list(APPEND CONVERSION_LINK_LIBS
     MLIRGPUDialect
diff --git a/mlir/lib/Conversion/Passes.cpp b/mlir/lib/Conversion/Passes.cpp
index
b8342da3e..4fb3057f2 100644
--- a/mlir/lib/Conversion/Passes.cpp
+++ b/mlir/lib/Conversion/Passes.cpp
@@ -9,7 +9,12 @@
 #include "air/Conversion/Passes.h"
 
 #if AIR_ENABLE_GPU
+#include "air/Conversion/AIRCrossRankDmaToMgpuPass.h"
+#include "air/Conversion/AIRGpuChannelToMgpuPass.h"
+#include "air/Conversion/AIRRankToMgpuPass.h"
+#include "air/Conversion/AIRSymmetricAllocToMgpuPass.h"
 #include "air/Conversion/AIRToROCDLPass.h"
+#include "air/Conversion/AIRTranslateToLLVMPass.h"
 #include "air/Conversion/GPUKernelOutlinePass.h"
 #endif
 
diff --git a/mlir/lib/Dialect/AIR/IR/AIRDialect.cpp b/mlir/lib/Dialect/AIR/IR/AIRDialect.cpp
index 720d09a7f..eca0c7dd8 100644
--- a/mlir/lib/Dialect/AIR/IR/AIRDialect.cpp
+++ b/mlir/lib/Dialect/AIR/IR/AIRDialect.cpp
@@ -3598,6 +3598,25 @@ ParseResult air::CustomOp::parse(OpAsmParser &parser, OperationState &result) {
   return success();
 }
 
+//
+// TranslateOp
+//
+
+// Fold a translate whose from/to ranks are the same: either both operands are
+// the same SSA value, or both are integer constants with equal values. The op
+// then folds to its source operand; otherwise no fold is performed.
+OpFoldResult air::TranslateOp::fold(FoldAdaptor adaptor) {
+  // Same SSA value on both sides.
+  if (getFromRank() == getToRank())
+    return getSource();
+  // Distinct SSA values that both fold to the same integer constant.
+  auto fromAttr = dyn_cast_if_present<IntegerAttr>(adaptor.getFromRank());
+  auto toAttr = dyn_cast_if_present<IntegerAttr>(adaptor.getToRank());
+  if (fromAttr && toAttr && fromAttr.getValue() == toAttr.getValue())
+    return getSource();
+  return {};
+}
+
 } // namespace xilinx
 
 #include "air/Dialect/AIR/AIROpInterfaces.cpp.inc"
diff --git a/mlir/test/Conversion/AIRCrossRankDmaToMgpu/cross_rank_dma.mlir b/mlir/test/Conversion/AIRCrossRankDmaToMgpu/cross_rank_dma.mlir
new file mode 100644
index 000000000..335c2ac5a
--- /dev/null
+++ b/mlir/test/Conversion/AIRCrossRankDmaToMgpu/cross_rank_dma.mlir
@@ -0,0 +1,136 @@
+//===- cross_rank_dma.mlir --------------------------------------*- MLIR -*-===//
+//
+// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
+// SPDX-License-Identifier: MIT +// +//===-----------------------------------------------------------------------===// + +// RUN: air-opt %s --split-input-file -air-cross-rank-dma-to-mgpu | FileCheck %s + +// Each test wraps the cross-rank dma in air.rank to satisfy the verifier +// (added in Phase 1) that requires an enclosing air.rank scope. + +// Basic src_rank: lower to mgpuMemcpy with peer-VA addressing. +// CHECK-LABEL: func.func @src_rank +// CHECK: arith.constant 4096 : i64 +// CHECK: llvm.mlir.zero : !llvm.ptr +// Extract pointers from both memrefs. +// CHECK: memref.extract_aligned_pointer_as_index +// CHECK: memref.extract_aligned_pointer_as_index +// Get bases and rank. +// CHECK: call @mgpuGetHeapBases() : () -> !llvm.ptr +// CHECK: call @mgpuGetRank() : () -> i32 +// CHECK: arith.extsi +// CHECK: llvm.getelementptr +// CHECK: llvm.load +// peer rank constant (0). +// CHECK: llvm.mlir.constant(0 : i64) +// CHECK: llvm.getelementptr +// CHECK: llvm.load +// offset = peer_local_int - my_base_int. +// CHECK: llvm.ptrtoint +// CHECK: llvm.ptrtoint +// CHECK: arith.subi +// peer_ptr = peer_base + offset (byte stride GEP). +// CHECK: llvm.getelementptr {{.*}} -> !llvm.ptr, i8 +// Final memcpy call. +// CHECK: call @mgpuMemcpy +// CHECK-NOT: air.dma_memcpy_nd +func.func @src_rank(%dst: memref<1024xf32>, %src: memref<1024xf32>) { + %c2 = arith.constant 2 : index + air.rank (%rid) in (%rsize = %c2) args(%d = %dst, %s = %src) + : memref<1024xf32>, memref<1024xf32> { + air.dma_memcpy_nd (%d[] [] [], %s[] [] []) {src_rank = 0 : i64} + : (memref<1024xf32>, memref<1024xf32>) + air.rank_terminator + } + return +} + +// ----- + +// dst_rank: same lowering pattern, peer pointer becomes the dst arg. 
+// CHECK-LABEL: func.func @dst_rank +// CHECK: call @mgpuMemcpy +// CHECK-NOT: air.dma_memcpy_nd +func.func @dst_rank(%dst: memref<1024xf32>, %src: memref<1024xf32>) { + %c2 = arith.constant 2 : index + air.rank (%rid) in (%rsize = %c2) args(%d = %dst, %s = %src) + : memref<1024xf32>, memref<1024xf32> { + air.dma_memcpy_nd (%d[] [] [], %s[] [] []) {dst_rank = 1 : i64} + : (memref<1024xf32>, memref<1024xf32>) + air.rank_terminator + } + return +} + +// ----- + +// 2D memref byte size: 64 * 64 * 4 = 16384. +// CHECK-LABEL: func.func @cross_rank_2d +// CHECK: arith.constant 16384 : i64 +// CHECK: call @mgpuMemcpy +func.func @cross_rank_2d(%dst: memref<64x64xf32>, %src: memref<64x64xf32>) { + %c2 = arith.constant 2 : index + air.rank (%rid) in (%rsize = %c2) args(%d = %dst, %s = %src) + : memref<64x64xf32>, memref<64x64xf32> { + air.dma_memcpy_nd (%d[] [] [], %s[] [] []) {src_rank = 0 : i64} + : (memref<64x64xf32>, memref<64x64xf32>) + air.rank_terminator + } + return +} + +// ----- + +// f64 element type: 256 * 8 = 2048 bytes. +// CHECK-LABEL: func.func @cross_rank_f64 +// CHECK: arith.constant 2048 : i64 +func.func @cross_rank_f64(%dst: memref<256xf64>, %src: memref<256xf64>) { + %c2 = arith.constant 2 : index + air.rank (%rid) in (%rsize = %c2) args(%d = %dst, %s = %src) + : memref<256xf64>, memref<256xf64> { + air.dma_memcpy_nd (%d[] [] [], %s[] [] []) {src_rank = 0 : i64} + : (memref<256xf64>, memref<256xf64>) + air.rank_terminator + } + return +} + +// ----- + +// Multiple cross-rank DMAs in one function: extern decls emitted exactly once. +// Match emission order from ensureExternFunc (insertion-at-top -> reverse). 
+// CHECK-COUNT-1: func.func private @mgpuMemcpy +// CHECK-NOT: func.func private @mgpuMemcpy +// CHECK-COUNT-1: func.func private @mgpuGetHeapBases +// CHECK-NOT: func.func private @mgpuGetHeapBases +// CHECK-COUNT-1: func.func private @mgpuGetRank +// CHECK-NOT: func.func private @mgpuGetRank +// CHECK-LABEL: func.func @two_dmas +// CHECK-COUNT-2: call @mgpuMemcpy +func.func @two_dmas(%dst: memref<32xf32>, %src: memref<32xf32>) { + %c2 = arith.constant 2 : index + air.rank (%rid) in (%rsize = %c2) args(%d = %dst, %s = %src) + : memref<32xf32>, memref<32xf32> { + air.dma_memcpy_nd (%d[] [] [], %s[] [] []) {src_rank = 0 : i64} + : (memref<32xf32>, memref<32xf32>) + air.dma_memcpy_nd (%d[] [] [], %s[] [] []) {src_rank = 0 : i64} + : (memref<32xf32>, memref<32xf32>) + air.rank_terminator + } + return +} + +// ----- + +// LAST partition: pass is a no-op for non-cross-rank DMAs. +// CHECK-LABEL: func.func @no_cross_rank +// CHECK: air.dma_memcpy_nd +// CHECK-NOT: mgpuMemcpy +// CHECK-NOT: mgpuGetHeapBases +func.func @no_cross_rank(%dst: memref<1024xf32, 2>, %src: memref<1024xf32>) { + air.dma_memcpy_nd (%dst[] [] [], %src[] [] []) + : (memref<1024xf32, 2>, memref<1024xf32>) + return +} diff --git a/mlir/test/Conversion/AIRGpuChannelToMgpu/gpu_channel.mlir b/mlir/test/Conversion/AIRGpuChannelToMgpu/gpu_channel.mlir new file mode 100644 index 000000000..64da49ab0 --- /dev/null +++ b/mlir/test/Conversion/AIRGpuChannelToMgpu/gpu_channel.mlir @@ -0,0 +1,87 @@ +//===- gpu_channel.mlir -----------------------------------------*- MLIR -*-===// +// +// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved. +// SPDX-License-Identifier: MIT +// +//===-----------------------------------------------------------------------===// + +// RUN: air-opt %s --split-input-file -air-gpu-channel-to-mgpu | FileCheck %s + +// Basic put/get pair with peer-rank index. The put becomes a barrier; the +// get becomes barrier + cross-rank mgpuMemcpy. 
+// CHECK-LABEL: func.func @basic_pair +// CHECK-NOT: air.channel @ +// CHECK: arith.constant 0 : index +// Inside the rank body: put -> barrier +// CHECK: call @mgpuBarrier +// CHECK-NOT: air.channel.put +// Then: get -> barrier + memcpy with peer-VA addressing. +// CHECK: call @mgpuBarrier +// CHECK: arith.constant 4096 : i64 +// CHECK: memref.extract_aligned_pointer_as_index +// CHECK: memref.extract_aligned_pointer_as_index +// CHECK: call @mgpuGetHeapBases +// CHECK: call @mgpuGetRank +// CHECK: llvm.getelementptr +// CHECK: llvm.load +// peer rank = constant 0 (peer index from get). +// CHECK: arith.index_cast +// CHECK: llvm.getelementptr +// CHECK: llvm.load +// offset = src_int - my_base_int. +// CHECK: llvm.ptrtoint +// CHECK: llvm.ptrtoint +// CHECK: arith.subi +// peer_src = peer_base + offset (byte stride). +// CHECK: llvm.getelementptr {{.*}} -> !llvm.ptr, i8 +// CHECK: call @mgpuMemcpy +// CHECK-NOT: air.channel.get +air.channel @sym_chan [] {channel_type = "gpu_symmetric_heap"} +func.func @basic_pair(%src: memref<1024xf32>, %dst: memref<1024xf32>) { + %c2 = arith.constant 2 : index + air.rank (%rid) in (%rsize = %c2) args(%s = %src, %d = %dst) + : memref<1024xf32>, memref<1024xf32> { + %c0 = arith.constant 0 : index + %sym = memref.alloc() {air.symmetric} : memref<1024xf32> + air.channel.put @sym_chan[] (%sym[] [] []) : (memref<1024xf32>) + air.channel.get @sym_chan[%c0] (%d[] [] []) : (memref<1024xf32>) + memref.dealloc %sym : memref<1024xf32> + air.rank_terminator + } + return +} + +// ----- + +// Channel decl is erased after lowering (the channel symbol no longer +// exists in the lowered IR). 
+// CHECK-LABEL: func.func @decl_erased +// CHECK-NOT: air.channel @sym_chan2 +air.channel @sym_chan2 [] {channel_type = "gpu_symmetric_heap"} +func.func @decl_erased(%dst: memref<32xf32>) { + %c2 = arith.constant 2 : index + air.rank (%rid) in (%rsize = %c2) args(%d = %dst) + : memref<32xf32> { + %c0 = arith.constant 0 : index + %sym = memref.alloc() {air.symmetric} : memref<32xf32> + air.channel.put @sym_chan2[] (%sym[] [] []) : (memref<32xf32>) + air.channel.get @sym_chan2[%c0] (%d[] [] []) : (memref<32xf32>) + memref.dealloc %sym : memref<32xf32> + air.rank_terminator + } + return +} + +// ----- + +// LAST partition: pass is a no-op for non-gpu_symmetric_heap channels. +// (npu_dma_stream channels must be left alone for the AIE backend.) +// CHECK-LABEL: func.func @no_gpu_channel +// CHECK: air.channel.put @npu_chan +// CHECK-NOT: mgpuMemcpy +// CHECK-NOT: mgpuGetHeapBases +air.channel @npu_chan [] {channel_type = "npu_dma_stream"} +func.func @no_gpu_channel(%src: memref<32xf32>) { + air.channel.put @npu_chan[] (%src[] [] []) : (memref<32xf32>) + return +} diff --git a/mlir/test/Conversion/AIRRankToMgpu/rank_to_mgpu.mlir b/mlir/test/Conversion/AIRRankToMgpu/rank_to_mgpu.mlir new file mode 100644 index 000000000..067547ee4 --- /dev/null +++ b/mlir/test/Conversion/AIRRankToMgpu/rank_to_mgpu.mlir @@ -0,0 +1,189 @@ +//===- rank_to_mgpu.mlir ----------------------------------------*- MLIR -*-===// +// +// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved. 
+// SPDX-License-Identifier: MIT +// +//===-----------------------------------------------------------------------===// + +// RUN: air-opt %s --split-input-file -air-rank-to-mgpu | FileCheck %s +// RUN: air-opt %s --split-input-file -air-rank-to-mgpu='heap-size=536870912' | FileCheck %s --check-prefix=HEAPOPT + +// CHECK-LABEL: func.func @test_rank_1d +// CHECK: call @mgpuSymmetricHeapInit +// CHECK-NOT: air.rank +// CHECK: %[[R:.*]] = call @mgpuGetRank() : () -> i32 +// CHECK: arith.extsi %[[R]] : i32 to i64 +// CHECK: arith.index_cast +// CHECK: call @mgpuSymmetricHeapDestroy +// CHECK: return + +// HEAPOPT-LABEL: func.func @test_rank_1d +// HEAPOPT: arith.constant 536870912 : i64 +// HEAPOPT: call @mgpuSymmetricHeapInit +func.func @test_rank_1d(%arg0: memref<16x16xf32>) { + %c2 = arith.constant 2 : index + air.rank (%rx) in (%sx = %c2) args(%a=%arg0) : memref<16x16xf32> { + %c1 = arith.constant 1 : index + air.launch (%lx) in (%ls = %c1) args(%la=%a) : memref<16x16xf32> { + air.launch_terminator + } + } + return +} + +// ----- + +// CHECK-LABEL: func.func @test_rank_2d +// 2D rank delinearization: id_x = flat % sx, id_y = flat / sx +// CHECK: %[[FLAT:.*]] = arith.index_cast +// CHECK: %[[IDX:.*]] = arith.remsi %[[FLAT]], %{{.*}} +// CHECK: %[[IDY:.*]] = arith.divsi %[[FLAT]], %{{.*}} +// CHECK-NOT: air.rank +func.func @test_rank_2d(%arg0: memref<16x16xf32>) { + %c2 = arith.constant 2 : index + %c4 = arith.constant 4 : index + air.rank (%rx, %ry) in (%sx = %c2, %sy = %c4) args(%a=%arg0) : memref<16x16xf32> { + %c1 = arith.constant 1 : index + air.launch (%lx) in (%ls = %c1) args(%la=%a) : memref<16x16xf32> { + air.launch_terminator + } + } + return +} + +// ----- + +// Default heap size is 256 MB = 268435456. 
+// CHECK-LABEL: func.func @test_rank_default_heap +// CHECK: arith.constant 268435456 : i64 +// CHECK: call @mgpuSymmetricHeapInit +func.func @test_rank_default_heap() { + %c2 = arith.constant 2 : index + air.rank (%rx) in (%sx = %c2) { + } + return +} + +// ----- + +// Async form: air.rank with async result token. Pass should produce a wait_all +// to replace the token, and the body should still be inlined. +// CHECK-LABEL: func.func @test_rank_async +// CHECK: call @mgpuSymmetricHeapInit +// CHECK: call @mgpuGetRank +// CHECK-NOT: air.rank +// CHECK: air.wait_all +// CHECK: call @mgpuSymmetricHeapDestroy +func.func @test_rank_async() -> !air.async.token { + %c2 = arith.constant 2 : index + %t = air.rank async (%rx) in (%sx = %c2) { + } + return %t : !air.async.token +} + +// ----- + +// Async dependency: air.rank async [%dep]. Pass must insert a blocking +// wait_all on the dependency before lowering the rank body. +// CHECK-LABEL: func.func @test_rank_async_dep +// CHECK: %[[DEP:.*]] = air.wait_all async +// CHECK: air.wait_all [%[[DEP]]] +// CHECK: call @mgpuGetRank +// CHECK-NOT: air.rank +func.func @test_rank_async_dep() { + %c2 = arith.constant 2 : index + %dep = air.wait_all async + %t = air.rank async [%dep] (%rx) in (%sx = %c2) { + } + return +} + +// ----- + +// Multiple air.rank ops in one function: heap init should appear once +// (at function entry) and destroy once (before return), regardless of how +// many rank ops are inlined. Each rank produces its own mgpuGetRank(). +// CHECK-LABEL: func.func @test_multiple_ranks +// CHECK-COUNT-1: call @mgpuSymmetricHeapInit +// CHECK-COUNT-2: call @mgpuGetRank +// CHECK-COUNT-1: call @mgpuSymmetricHeapDestroy +// CHECK-NOT: air.rank +func.func @test_multiple_ranks() { + %c2 = arith.constant 2 : index + air.rank (%rx) in (%sx = %c2) { + } + air.rank (%rx) in (%sx = %c2) { + } + return +} + +// ----- + +// Multiple returns: destroy should be inserted before EACH return path. 
+// CHECK-LABEL: func.func @test_multiple_returns +// CHECK-COUNT-1: call @mgpuSymmetricHeapInit +// CHECK-COUNT-2: call @mgpuSymmetricHeapDestroy +func.func @test_multiple_returns(%cond: i1) { + %c2 = arith.constant 2 : index + air.rank (%rx) in (%sx = %c2) { + } + cf.cond_br %cond, ^bb1, ^bb2 +^bb1: + return +^bb2: + return +} + +// ----- + +// Kernel operand mapping: a value passed as args(%a=%arg0) should be +// substituted into the inlined body so that uses of the block arg are +// replaced with the original SSA value. +// CHECK-LABEL: func.func @test_kernel_args( +// CHECK-SAME: %[[ARG0:.*]]: memref<16x16xf32> +// CHECK-NOT: air.rank +// The store should reference the function arg directly, not a block arg. +// CHECK: memref.store %{{.*}}, %[[ARG0]] +func.func @test_kernel_args(%arg0: memref<16x16xf32>) { + %c2 = arith.constant 2 : index + air.rank (%rx) in (%sx = %c2) args(%a=%arg0) : memref<16x16xf32> { + %cst = arith.constant 0.0 : f32 + %c0 = arith.constant 0 : index + memref.store %cst, %a[%c0, %c0] : memref<16x16xf32> + } + return +} + +// ----- + +// Idempotent extern decls: only one decl of each mgpu* function in the +// module, even with multiple ranks across multiple functions. +// CHECK-COUNT-1: func.func private @mgpuGetRank +// CHECK-NOT: func.func private @mgpuGetRank +// CHECK-COUNT-1: func.func private @mgpuSymmetricHeapDestroy +// CHECK-NOT: func.func private @mgpuSymmetricHeapDestroy +// CHECK-COUNT-1: func.func private @mgpuSymmetricHeapInit +// CHECK-NOT: func.func private @mgpuSymmetricHeapInit +func.func @test_decls_in_func_a() { + %c2 = arith.constant 2 : index + air.rank (%rx) in (%sx = %c2) {} + return +} +func.func @test_decls_in_func_b() { + %c2 = arith.constant 2 : index + air.rank (%rx) in (%sx = %c2) {} + return +} + +// ----- + +// A function with NO air.rank should be left completely untouched. +// (Placed last in the file so CHECK-NOTs aren't matched against later +// partitions that legitimately contain mgpu* decls.) 
+// CHECK-LABEL: func.func @test_no_rank +// CHECK-NOT: mgpuSymmetricHeapInit +// CHECK-NOT: mgpuSymmetricHeapDestroy +// CHECK-NOT: mgpuGetRank +func.func @test_no_rank(%arg0: memref<16x16xf32>) -> memref<16x16xf32> { + return %arg0 : memref<16x16xf32> +} diff --git a/mlir/test/Conversion/AIRSymmetricAllocToMgpu/symmetric_alloc.mlir b/mlir/test/Conversion/AIRSymmetricAllocToMgpu/symmetric_alloc.mlir new file mode 100644 index 000000000..b0e30f0c7 --- /dev/null +++ b/mlir/test/Conversion/AIRSymmetricAllocToMgpu/symmetric_alloc.mlir @@ -0,0 +1,106 @@ +//===- symmetric_alloc.mlir -------------------------------------*- MLIR -*-===// +// +// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved. +// SPDX-License-Identifier: MIT +// +//===-----------------------------------------------------------------------===// + +// RUN: air-opt %s --split-input-file -air-symmetric-alloc-to-mgpu | FileCheck %s + +// Basic 1D alloc + dealloc. +// CHECK-LABEL: func.func @basic_alloc_dealloc +// CHECK: %[[SZ:.*]] = arith.constant 4096 : i64 +// CHECK: %[[NULL:.*]] = llvm.mlir.zero : !llvm.ptr +// CHECK: %[[PTR:.*]] = call @mgpuSymmetricAlloc(%[[SZ]], %[[NULL]]) : (i64, !llvm.ptr) -> !llvm.ptr +// Descriptor build (poison + insertvalue) then unrealized cast. +// CHECK: llvm.mlir.poison +// CHECK: llvm.insertvalue %[[PTR]] +// CHECK: llvm.insertvalue %[[PTR]] +// CHECK: builtin.unrealized_conversion_cast {{.*}} : !llvm.struct<{{.*}}> to memref<1024xf32> +// Dealloc -> mgpuSymmetricFree. +// CHECK: call @mgpuSymmetricFree(%[[PTR]], +// CHECK-NOT: memref.alloc +// CHECK-NOT: memref.dealloc +func.func @basic_alloc_dealloc() { + %buf = memref.alloc() {air.symmetric} : memref<1024xf32> + memref.dealloc %buf : memref<1024xf32> + return +} + +// ----- + +// 2D alloc: 64*64*4 = 16384 bytes; descriptor strides should be [64, 1]. 
+// CHECK-LABEL: func.func @alloc_2d +// CHECK: arith.constant 16384 : i64 +// CHECK: call @mgpuSymmetricAlloc +// Strides 64 then 1 in the descriptor (row-major: innermost dimension is contiguous). +// CHECK: llvm.mlir.constant(64 : i64) +// CHECK: llvm.insertvalue +// CHECK: llvm.mlir.constant(1 : i64) +// CHECK: llvm.insertvalue +// CHECK: builtin.unrealized_conversion_cast {{.*}} : !llvm.struct<{{.*}}> to memref<64x64xf32> +func.func @alloc_2d() -> memref<64x64xf32> { + %buf = memref.alloc() {air.symmetric} : memref<64x64xf32> + return %buf : memref<64x64xf32> +} + +// ----- + +// f64 element type (8 bytes): 1024 * 8 = 8192 bytes. +// CHECK-LABEL: func.func @f64_element +// CHECK: arith.constant 8192 : i64 +func.func @f64_element() { + %buf = memref.alloc() {air.symmetric} : memref<1024xf64> + memref.dealloc %buf : memref<1024xf64> + return +} + +// ----- + +// i32 element type (4 bytes): 256 * 4 = 1024 bytes. +// CHECK-LABEL: func.func @i32_element +// CHECK: arith.constant 1024 : i64 +func.func @i32_element() { + %buf = memref.alloc() {air.symmetric} : memref<256xi32> + memref.dealloc %buf : memref<256xi32> + return +} + +// ----- + +// Multiple symmetric allocs in one function: each lowered independently; +// extern decls are emitted exactly once at module scope. +// Match the actual emission order: Free decl before Alloc decl. +// CHECK-COUNT-1: func.func private @mgpuSymmetricFree +// CHECK-NOT: func.func private @mgpuSymmetricFree +// CHECK-COUNT-1: func.func private @mgpuSymmetricAlloc +// CHECK-NOT: func.func private @mgpuSymmetricAlloc +// CHECK-LABEL: func.func @two_allocs +// CHECK-COUNT-2: call @mgpuSymmetricAlloc +// CHECK-COUNT-2: call @mgpuSymmetricFree +func.func @two_allocs() { + %a = memref.alloc() {air.symmetric} : memref<32xf32> + %b = memref.alloc() {air.symmetric} : memref<64xf32> + memref.dealloc %a : memref<32xf32> + memref.dealloc %b : memref<64xf32> + return +} + +// ----- + +// LAST partition: cases that test the pass leaves things untouched.
+// Both `ignores_non_symmetric` and `no_symmetric_alloc` are folded here +// so the trailing CHECK-NOTs only need to match against this one (final) +// partition's text. +// CHECK-LABEL: func.func @no_symmetric_changes +// CHECK: memref.alloc() : memref<1024xf32> +// CHECK: memref.alloc() : memref<32xf32> +// CHECK-NOT: mgpuSymmetricAlloc +// CHECK-NOT: mgpuSymmetricFree +func.func @no_symmetric_changes() { + %a = memref.alloc() : memref<1024xf32> + memref.dealloc %a : memref<1024xf32> + %b = memref.alloc() : memref<32xf32> + memref.dealloc %b : memref<32xf32> + return +} diff --git a/mlir/test/Conversion/AIRToROCDL/air_translate_to_llvm.mlir b/mlir/test/Conversion/AIRToROCDL/air_translate_to_llvm.mlir new file mode 100644 index 000000000..84f96db8c --- /dev/null +++ b/mlir/test/Conversion/AIRToROCDL/air_translate_to_llvm.mlir @@ -0,0 +1,86 @@ +//===- air_translate_to_llvm.mlir - air-translate-to-llvm pass -----------===// +// +// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved. +// SPDX-License-Identifier: MIT +// +//===-----------------------------------------------------------------------===// + +// REQUIRES: gpu +// RUN: air-opt --air-translate-to-llvm --split-input-file %s | FileCheck %s + +// 1D static memref: full peer-VA expansion shape. 
+// CHECK-LABEL: func.func @translate_1d +// CHECK-DAG: %[[SRC_IDX:.+]] = memref.extract_aligned_pointer_as_index %arg0 +// CHECK-DAG: %[[FROM_BASE:.+]] = memref.load %arg3[%arg1] : memref +// CHECK-DAG: %[[TO_BASE:.+]] = memref.load %arg3[%arg2] : memref +// CHECK: %[[DIFF:.+]] = arith.subi %[[TO_BASE]], %[[FROM_BASE]] +// CHECK: %[[PEER_IDX:.+]] = arith.addi %[[SRC_IDX]], %[[DIFF]] +// CHECK: %[[PEER_I64:.+]] = arith.index_cast %[[PEER_IDX]] : index to i64 +// CHECK: %[[PEER_PTR:.+]] = llvm.inttoptr %[[PEER_I64]] : i64 to !llvm.ptr +// CHECK: %[[POISON:.+]] = llvm.mlir.poison : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> +// CHECK: %[[D0:.+]] = llvm.insertvalue %[[PEER_PTR]], %[[POISON]][0] +// CHECK: %[[D1:.+]] = llvm.insertvalue %[[PEER_PTR]], %[[D0]][1] +// CHECK: %{{.*}} = llvm.mlir.constant(0 : i64) +// CHECK: %[[D2:.+]] = llvm.insertvalue %{{.*}}, %[[D1]][2] +// CHECK: %{{.*}} = llvm.mlir.constant(1024 : i64) +// CHECK: %[[D3:.+]] = llvm.insertvalue %{{.*}}, %[[D2]][3, 0] +// CHECK: %{{.*}} = llvm.mlir.constant(1 : i64) +// CHECK: %[[D4:.+]] = llvm.insertvalue %{{.*}}, %[[D3]][4, 0] +// CHECK: %[[CAST:.+]] = builtin.unrealized_conversion_cast %[[D4]] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> to memref<1024xf32> +// CHECK: return %[[CAST]] +// CHECK-NOT: air.translate +func.func @translate_1d(%src : memref<1024xf32>, %from : index, %to : index, %bases : memref) -> memref<1024xf32> { + %peer = air.translate %src, %from, %to, %bases : memref<1024xf32>, memref + return %peer : memref<1024xf32> +} + +// ----- + +// 2D static memref: descriptor includes row-major strides [64, 1]. 
+// CHECK-LABEL: func.func @translate_2d +// CHECK: memref.load %arg3[%arg1] : memref +// CHECK: memref.load %arg3[%arg2] : memref +// CHECK: llvm.mlir.poison : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> +// CHECK-DAG: llvm.mlir.constant(64 : i64) +// CHECK-DAG: llvm.mlir.constant(1 : i64) +// CHECK: builtin.unrealized_conversion_cast {{.*}} to memref<64x64xf32, 1> +// CHECK-NOT: air.translate +func.func @translate_2d(%src : memref<64x64xf32, 1>, %from : index, %to : index, %bases : memref) -> memref<64x64xf32, 1> { + %peer = air.translate %src, %from, %to, %bases : memref<64x64xf32, 1>, memref + return %peer : memref<64x64xf32, 1> +} + +// ----- + +// Inside a gpu.func (kernel-side use): same expansion shape — purely +// memref + arith ops, no runtime call. +// CHECK-LABEL: gpu.func @kernel +// CHECK: memref.extract_aligned_pointer_as_index +// CHECK: memref.load %arg3[%arg1] : memref +// CHECK: memref.load %arg3[%arg2] : memref +// CHECK: arith.subi +// CHECK: arith.addi +// CHECK: builtin.unrealized_conversion_cast {{.*}} to memref<1024xf32, 1> +// CHECK: memref.store +// CHECK-NOT: air.translate +gpu.module @kernels { + gpu.func @kernel(%data : memref<1024xf32, 1>, %from : index, %to : index, %bases : memref) kernel { + %peer = air.translate %data, %from, %to, %bases : memref<1024xf32, 1>, memref + %c0 = arith.constant 0 : index + %c42 = arith.constant 42.0 : f32 + memref.store %c42, %peer[%c0] : memref<1024xf32, 1> + gpu.return + } +} + +// ----- + +// No air.translate: pass is a no-op. 
+// CHECK-LABEL: func.func @noop +// CHECK-NEXT: return +// CHECK-NOT: memref.extract_aligned_pointer_as_index +// CHECK-NOT: llvm.mlir.poison +func.func @noop(%a : memref<8xf32>) -> memref<8xf32> { + return %a : memref<8xf32> +} + diff --git a/mlir/test/Conversion/AIRToROCDL/sym_atomic_syncscope.mlir b/mlir/test/Conversion/AIRToROCDL/sym_atomic_syncscope.mlir new file mode 100644 index 000000000..4092f6fa5 --- /dev/null +++ b/mlir/test/Conversion/AIRToROCDL/sym_atomic_syncscope.mlir @@ -0,0 +1,40 @@ +//===- sym_atomic_syncscope.mlir - cross-XGMI atomic preservation --------===// +// +// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved. +// SPDX-License-Identifier: MIT +// +//===-----------------------------------------------------------------------===// +// +// The symmetric-heap producer/consumer test relies on a contract that +// `llvm.atomicrmw release` and `llvm.load atomic acquire` ops emitted with +// `syncscope("")` (= LLVM IR's System scope = cross-device on AMDGPU) +// survive the GPU compilation pipeline unchanged. Without that, the +// producer's release-store on rank 0's GPU is not seen by the consumer's +// acquire-load on rank 1's GPU, and the consumer hangs forever (test +// times out — appears as "no crash, no signal, just dead"). +// +// The empty-string syncscope is LLVM IR's canonical spelling of System +// scope (LLVM's textual IR omits the `syncscope(...)` token entirely when +// scope == System; MLIR's LLVM dialect round-trips it as `syncscope("")`). +// LLVM's AMDGPU backend guide (AMDGPUUsage) defines System as cross-device: +// https://llvm.org/docs/AMDGPUUsage.html#memory-model +// +// This test asserts that after `convert-gpu-to-rocdl` the atomic ops +// retain their ordering and the explicit `syncscope("")` qualifier.
+// +//===-----------------------------------------------------------------------===// + +// REQUIRES: gpu +// RUN: air-opt --pass-pipeline='builtin.module(rocdl-attach-target{chip=gfx942 O=3},gpu.module(convert-scf-to-cf,convert-gpu-to-rocdl{chipset=gfx942 runtime=HIP},reconcile-unrealized-casts))' %s | FileCheck %s + +// CHECK-LABEL: gpu.module @kernels +// CHECK-LABEL: llvm.func @atomic_kernel +// CHECK: llvm.atomicrmw xchg %{{.*}}, %{{.*}} syncscope("") release : !llvm.ptr, i32 +// CHECK: llvm.load %{{.*}} atomic syncscope("") acquire {{.*}} : !llvm.ptr -> i32 +gpu.module @kernels { + gpu.func @atomic_kernel(%ptr : !llvm.ptr, %v : i32) kernel { + %old = llvm.atomicrmw xchg %ptr, %v syncscope("") release : !llvm.ptr, i32 + %loaded = llvm.load %ptr atomic syncscope("") acquire {alignment = 4 : i64} : !llvm.ptr -> i32 + gpu.return + } +} diff --git a/mlir/test/Dialect/AIR/air_translate.mlir b/mlir/test/Dialect/AIR/air_translate.mlir new file mode 100644 index 000000000..c107da0c8 --- /dev/null +++ b/mlir/test/Dialect/AIR/air_translate.mlir @@ -0,0 +1,63 @@ +//===- air_translate.mlir - air.translate parser, printer, folder --------===// +// +// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved. +// SPDX-License-Identifier: MIT +// +//===-----------------------------------------------------------------------===// + +// RUN: air-opt %s | FileCheck %s +// RUN: air-opt --canonicalize %s | FileCheck %s --check-prefix=FOLD + +// Round-trip: 1D static memref. +// CHECK-LABEL: func.func @translate_1d +// CHECK: %{{.*}} = air.translate %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref<1024xf32>, memref +func.func @translate_1d(%src : memref<1024xf32>, %from : index, %to : index, %bases : memref) -> memref<1024xf32> { + %peer = air.translate %src, %from, %to, %bases : memref<1024xf32>, memref + return %peer : memref<1024xf32> +} + +// Round-trip: 2D static memref in address space 1. 
+// CHECK-LABEL: func.func @translate_2d_addrspace +// CHECK: air.translate %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref<64x64xf32, 1>, memref +func.func @translate_2d_addrspace(%src : memref<64x64xf32, 1>, %from : index, %to : index, %bases : memref) -> memref<64x64xf32, 1> { + %peer = air.translate %src, %from, %to, %bases : memref<64x64xf32, 1>, memref + return %peer : memref<64x64xf32, 1> +} + +// Round-trip: static-shaped heap_bases is also accepted. +// CHECK-LABEL: func.func @translate_static_bases +// CHECK: air.translate %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref<8xf32>, memref<8xindex> +func.func @translate_static_bases(%src : memref<8xf32>, %from : index, %to : index, %bases : memref<8xindex>) -> memref<8xf32> { + %peer = air.translate %src, %from, %to, %bases : memref<8xf32>, memref<8xindex> + return %peer : memref<8xf32> +} + +// Folder: from_rank == to_rank (same SSA value) folds to %src. +// FOLD-LABEL: func.func @fold_same_rank +// FOLD-NOT: air.translate +// FOLD: return %arg0 : memref<8xf32> +func.func @fold_same_rank(%src : memref<8xf32>, %r : index, %bases : memref) -> memref<8xf32> { + %peer = air.translate %src, %r, %r, %bases : memref<8xf32>, memref + return %peer : memref<8xf32> +} + +// Folder: distinct constants with same value also fold. +// FOLD-LABEL: func.func @fold_constant_eq_ranks +// FOLD-NOT: air.translate +// FOLD: return %arg0 : memref<8xf32> +func.func @fold_constant_eq_ranks(%src : memref<8xf32>, %bases : memref) -> memref<8xf32> { + %c2 = arith.constant 2 : index + %c2_again = arith.constant 2 : index + %peer = air.translate %src, %c2, %c2_again, %bases : memref<8xf32>, memref + return %peer : memref<8xf32> +} + +// Non-fold: distinct constants do NOT fold. 
+// FOLD-LABEL: func.func @no_fold_distinct_constants +// FOLD: air.translate +func.func @no_fold_distinct_constants(%src : memref<8xf32>, %bases : memref) -> memref<8xf32> { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %peer = air.translate %src, %c0, %c1, %bases : memref<8xf32>, memref + return %peer : memref<8xf32> +} diff --git a/test/gpu/symmetric_heap_dma/air_sym_handwritten_atomic.mlir b/test/gpu/symmetric_heap_dma/air_sym_handwritten_atomic.mlir new file mode 100644 index 000000000..a0743e60c --- /dev/null +++ b/test/gpu/symmetric_heap_dma/air_sym_handwritten_atomic.mlir @@ -0,0 +1,330 @@ +//===- air_sym_handwritten_atomic.mlir - multi-GPU e2e (atomic flag) ------===// +// +// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved. +// SPDX-License-Identifier: MIT +// +//===------------------------------------------------------------------===// +// +// Symmetric-heap producer/consumer e2e (WORLD_SIZE=2), atomic-flag variant. +// Sister file: air_sym_handwritten_cacheline.mlir uses cache-line atomicity +// instead of LLVM atomics for the cross-rank handoff. +// +// rank 0 launches @producer; rank 1 launches @consumer. +// producer writes 42.0 into rank 1's `data` over XGMI; per-warp flags +// (4 i32, in rank 1's HBM) signal completion via release atomicrmw with +// syncscope("") (= LLVM System scope = cross-device on AMDGPU). +// consumer's lane 0 acquires on its flag, then all 64 lanes copy +// the local data slot to verify_buf for host check. +// Launch: 1 block × 256 threads = 4 warps × 64 lanes. +// +// Synchronization contract is spec-defined: see sym_atomic_syncscope.mlir +// for the FileCheck contract test that pins the lowering behavior. +// +// Launcher: run.sh forks N processes with RANK / WORLD_SIZE / LOCAL_RANK.
+// +//===------------------------------------------------------------------===// + +module attributes {gpu.container_module} { + // ---- mgpu* C ABI declarations ----------------------------------------- + func.func private @mgpuSymmetricHeapInit(i64) + func.func private @mgpuSymmetricHeapDestroy() + func.func private @mgpuGetRank() -> i32 + func.func private @mgpuGetWorldSize() -> i32 + func.func private @mgpuSymmetricAlloc(i64, !llvm.ptr) -> !llvm.ptr + func.func private @mgpuSymmetricFree(!llvm.ptr, !llvm.ptr) + func.func private @mgpuGetHeapBases() -> !llvm.ptr + func.func private @mgpuBarrier() + func.func private @mgpuMemAlloc(i64, !llvm.ptr, i1) -> !llvm.ptr + func.func private @mgpuMemFree(!llvm.ptr, !llvm.ptr) + func.func private @mgpuMemcpy(!llvm.ptr, !llvm.ptr, i64, !llvm.ptr) + + // libc exit — verify branch calls this on any mismatch so run.sh + // sees a non-zero process exit (no green-without-validation). + func.func private @exit(i32) + + llvm.func @printf(!llvm.ptr, ...) -> i32 + + llvm.mlir.global internal constant @msg_init( + "[mlir] rank %d / world %d, init OK\0A\00") {addr_space = 0 : i32} + llvm.mlir.global internal constant @msg_pass_p( + "[mlir] rank 0 (producer): kernel returned\0A\00") {addr_space = 0 : i32} + llvm.mlir.global internal constant @msg_pass_c( + "[mlir] rank 1 (consumer): cross-rank kernel write PASS (verify[0]=%.1f)\0A\00") {addr_space = 0 : i32} + llvm.mlir.global internal constant @msg_fail( + "[mlir] rank %d (consumer): MISMATCH at idx=%ld got=%.1f expected=%.1f\0A\00") {addr_space = 0 : i32} // varargs at the call site: i32 rank, i64 idx, f64 got, f64 expected + llvm.mlir.global internal constant @msg_done( + "[mlir] rank %d: ALL PASSED\0A\00") {addr_space = 0 : i32} + + // ---- GPU kernels ------------------------------------------------------ + gpu.module @sym_kernels { + + // Drop a memref<4xi32> + warp index to a raw !llvm.ptr to the warp's + // flag slot.
We must drop to llvm.ptr because memref dialect atomics + // (memref.atomic_rmw, memref.generic_atomic_rmw) lack ordering and + // syncscope today, and there is no memref.atomic_load/store at all. + // TODO: when upstream memref grows ordering+syncscope (track in + // mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td), inline this and + // use the memref-level ops directly. + func.func private @flag_slot_ptr(%flags : memref<4xi32>, %wid : index) -> !llvm.ptr { + %p_idx = memref.extract_aligned_pointer_as_index %flags : memref<4xi32> -> index + %p_int = arith.index_cast %p_idx : index to i64 + %p = llvm.inttoptr %p_int : i64 to !llvm.ptr + %w64 = arith.index_cast %wid : index to i64 + %slot = llvm.getelementptr %p[%w64] : (!llvm.ptr, i64) -> !llvm.ptr, i32 + return %slot : !llvm.ptr + } + + // Producer: each thread stores 42.0 into peer's data; lane 0 of each + // warp release-atomicrmws peer's per-warp flag. + gpu.func @producer(%data : memref<256xf32>, + %flags : memref<4xi32>, + %bases : memref) kernel + attributes {gpu.known_block_size = array, + gpu.known_grid_size = array} { + %c0 = arith.constant 0 : index + %c64 = arith.constant 64 : index + %c1_i32 = arith.constant 1 : i32 + %c42_f = arith.constant 42.0 : f32 + %from = arith.constant 0 : index // rank 0 (producer) + %to = arith.constant 1 : index // rank 1 (consumer) + + %tid = gpu.thread_id x + %wid = arith.divui %tid, %c64 : index + %lane = arith.remui %tid, %c64 : index + + %peer_data = air.translate %data, %from, %to, %bases : memref<256xf32>, memref + %peer_flags = air.translate %flags, %from, %to, %bases : memref<4xi32>, memref + memref.store %c42_f, %peer_data[%tid] : memref<256xf32> + + %is_lane0 = arith.cmpi eq, %lane, %c0 : index + scf.if %is_lane0 { + // syncscope("") = LLVM System scope = cross-device on AMDGPU. + // See sym_atomic_syncscope.mlir for the contract test. 
+ %slot_ptr = func.call @flag_slot_ptr(%peer_flags, %wid) + : (memref<4xi32>, index) -> !llvm.ptr + %old = llvm.atomicrmw xchg %slot_ptr, %c1_i32 syncscope("") release + : !llvm.ptr, i32 + } + gpu.return + } + + // Consumer: lane 0 acquires on its flag; then all 64 lanes copy + // their data slot into verify_buf for host check. + gpu.func @consumer(%data : memref<256xf32>, + %verify_buf : memref<256xf32>, + %flags : memref<4xi32>) kernel + attributes {gpu.known_block_size = array, + gpu.known_grid_size = array} { + %c0 = arith.constant 0 : index + %c64 = arith.constant 64 : index + %c0_i32 = arith.constant 0 : i32 + + %tid = gpu.thread_id x + %wid = arith.divui %tid, %c64 : index + %lane = arith.remui %tid, %c64 : index + + %is_lane0 = arith.cmpi eq, %lane, %c0 : index + scf.if %is_lane0 { + %slot_ptr = func.call @flag_slot_ptr(%flags, %wid) + : (memref<4xi32>, index) -> !llvm.ptr + // Spin: flag == 0. + scf.while : () -> () { + %v = llvm.load %slot_ptr atomic syncscope("") acquire + {alignment = 4 : i64} : !llvm.ptr -> i32 + %not_ready = arith.cmpi eq, %v, %c0_i32 : i32 + scf.condition(%not_ready) + } do { + scf.yield + } + } + // No gpu.barrier: on AMDGPU lanes within a wave execute in SIMT + // lockstep, so lanes 1..63 cannot leave the scf.if before lane 0 + // does, and the wave-shared L1 means lane 0's syncscope("") acquire + // makes the producer's writes visible to the whole wave. + %v = memref.load %data[%tid] : memref<256xf32> + memref.store %v, %verify_buf[%tid] : memref<256xf32> + gpu.return + } + } + + // ---- Helpers ---------------------------------------------------------- + // Single ABI-leaking helper: wrap a raw runtime !llvm.ptr as a 1-D byte + // memref. All typed views below derive from this via memref.view, so the + // hand-built LLVM-struct descriptor literal lives in exactly one place. + // Phase 4's AIRSymmetricAllocToMgpuPass will replace this entirely. 
+ func.func private @wrap_bytes(%ptr : !llvm.ptr, %size : i64) -> memref { + %c0_i64 = arith.constant 0 : i64 + %c1_i64 = arith.constant 1 : i64 + %d0 = llvm.mlir.poison : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %d1 = llvm.insertvalue %ptr, %d0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %d2 = llvm.insertvalue %ptr, %d1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %d3 = llvm.insertvalue %c0_i64, %d2[2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %d4 = llvm.insertvalue %size, %d3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %d5 = llvm.insertvalue %c1_i64, %d4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %m = builtin.unrealized_conversion_cast %d5 + : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> to memref + return %m : memref + } + + // ---- main ------------------------------------------------------------ + func.func @main() { + %c0_i32 = arith.constant 0 : i32 + %c1_i32 = arith.constant 1 : i32 + %c0_i64 = arith.constant 0 : i64 + %c1024_bytes = arith.constant 1024 : i64 // 256 f32 = 1024 bytes + %c16_bytes = arith.constant 16 : i64 // 4 i32 = 16 bytes + %heap_size = arith.constant 268435456 : i64 // 256 MB + %nullptr = llvm.mlir.zero : !llvm.ptr + %false = arith.constant false + + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + + // Heap init (collective). + func.call @mgpuSymmetricHeapInit(%heap_size) : (i64) -> () + %rank = func.call @mgpuGetRank() : () -> i32 + %world = func.call @mgpuGetWorldSize() : () -> i32 + %fmt_init = llvm.mlir.addressof @msg_init : !llvm.ptr + llvm.call @printf(%fmt_init, %rank, %world) + vararg(!llvm.func) : (!llvm.ptr, i32, i32) -> i32 + + // Symmetric allocations: data (256 f32) + flags (4 i32). 
+ %data_ptr = func.call @mgpuSymmetricAlloc(%c1024_bytes, %nullptr) : (i64, !llvm.ptr) -> !llvm.ptr + %flags_ptr = func.call @mgpuSymmetricAlloc(%c16_bytes, %nullptr) : (i64, !llvm.ptr) -> !llvm.ptr + + // Zero-init flags from host so the consumer's spin starts at 0. + %flags_host = memref.alloc() : memref<4xi32> + %fc0 = arith.constant 0 : index + %fc1 = arith.constant 1 : index + %fc4 = arith.constant 4 : index + scf.for %i = %fc0 to %fc4 step %fc1 { + memref.store %c0_i32, %flags_host[%i] : memref<4xi32> + } + %flags_host_intptr = memref.extract_aligned_pointer_as_index %flags_host + : memref<4xi32> -> index + %flags_host_int = arith.index_cast %flags_host_intptr : index to i64 + %flags_host_ptr = llvm.inttoptr %flags_host_int : i64 to !llvm.ptr + func.call @mgpuMemcpy(%flags_ptr, %flags_host_ptr, %c16_bytes, %nullptr) + : (!llvm.ptr, !llvm.ptr, i64, !llvm.ptr) -> () + memref.dealloc %flags_host : memref<4xi32> + + func.call @mgpuBarrier() : () -> () // flags init visible to all ranks + + %c0_view = arith.constant 0 : index + %data_bytes = func.call @wrap_bytes(%data_ptr, %c1024_bytes) : (!llvm.ptr, i64) -> memref + %flags_bytes = func.call @wrap_bytes(%flags_ptr, %c16_bytes) : (!llvm.ptr, i64) -> memref + %data_m = memref.view %data_bytes[%c0_view][] : memref to memref<256xf32> + %flags_m = memref.view %flags_bytes[%c0_view][] : memref to memref<4xi32> + + // mgpuGetHeapBases() returns a HOST pointer; GPU can't deref it, so + // copy to device. TODO(airgpu): make heap_bases device-accessible + // (hipMallocManaged / hipHostMalloc-Mapped) and drop this copy. 
+ %world_i64 = arith.extui %world : i32 to i64 + %c8_i64 = arith.constant 8 : i64 + %bases_size = arith.muli %world_i64, %c8_i64 : i64 + %bases_host = func.call @mgpuGetHeapBases() : () -> !llvm.ptr + %bases_devptr = func.call @mgpuMemAlloc(%bases_size, %nullptr, %false) + : (i64, !llvm.ptr, i1) -> !llvm.ptr + func.call @mgpuMemcpy(%bases_devptr, %bases_host, %bases_size, %nullptr) + : (!llvm.ptr, !llvm.ptr, i64, !llvm.ptr) -> () + %bases_bytes = func.call @wrap_bytes(%bases_devptr, %bases_size) : (!llvm.ptr, i64) -> memref + %world_idx = arith.index_cast %world_i64 : i64 to index + %bases = memref.view %bases_bytes[%c0_view][%world_idx] : memref to memref + + // Rank 0 = producer, rank 1 = consumer. Ranks > 1 idle. + // (Future: extend to all-pairs producer/consumer mesh.) + // Precondition: world >= 2 — enforced by run.sh, not re-checked here. + %is_producer = arith.cmpi eq, %rank, %c0_i32 : i32 + scf.if %is_producer { + gpu.launch_func @sym_kernels::@producer + blocks in (%c1, %c1, %c1) + threads in (%c256, %c1, %c1) + args(%data_m : memref<256xf32>, + %flags_m : memref<4xi32>, + %bases : memref) + %fmt_p = llvm.mlir.addressof @msg_pass_p : !llvm.ptr + llvm.call @printf(%fmt_p) + vararg(!llvm.func) : (!llvm.ptr) -> i32 + } else { + %is_consumer = arith.cmpi eq, %rank, %c1_i32 : i32 + scf.if %is_consumer { + %verify_ptr = func.call @mgpuMemAlloc(%c1024_bytes, %nullptr, %false) + : (i64, !llvm.ptr, i1) -> !llvm.ptr + %verify_bytes = func.call @wrap_bytes(%verify_ptr, %c1024_bytes) : (!llvm.ptr, i64) -> memref + %verify_m = memref.view %verify_bytes[%c0_view][] : memref to memref<256xf32> + gpu.launch_func @sym_kernels::@consumer + blocks in (%c1, %c1, %c1) + threads in (%c256, %c1, %c1) + args(%data_m : memref<256xf32>, + %verify_m: memref<256xf32>, + %flags_m : memref<4xi32>) + + // D2H readback verify_buf and check ALL 256 elements == 42.0. + // (Checking only element 0 would mask a bug where warps 1..3 + // didn't write their slice. 
exit(1) on mismatch makes the + // multi-process driver see a non-zero exit code.) + %hb = memref.alloc() : memref<256xf32> + %hb_intptr = memref.extract_aligned_pointer_as_index %hb : memref<256xf32> -> index + %hb_int = arith.index_cast %hb_intptr : index to i64 + %hb_ptr = llvm.inttoptr %hb_int : i64 to !llvm.ptr + func.call @mgpuMemcpy(%hb_ptr, %verify_ptr, %c1024_bytes, %nullptr) + : (!llvm.ptr, !llvm.ptr, i64, !llvm.ptr) -> () + + %c0_idx = arith.constant 0 : index + %c1_idx = arith.constant 1 : index + %c256_idx = arith.constant 256 : index + %expected = arith.constant 42.0 : f32 + + // Count mismatches; print msg_fail on the first. + %nfail = scf.for %i = %c0_idx to %c256_idx step %c1_idx + iter_args(%nfail_acc = %c0_i32) -> (i32) { + %v = memref.load %hb[%i] : memref<256xf32> + %ne = arith.cmpf une, %v, %expected : f32 + %new_nfail = scf.if %ne -> i32 { + %is_first = arith.cmpi eq, %nfail_acc, %c0_i32 : i32 + scf.if %is_first { + %fmt_fail = llvm.mlir.addressof @msg_fail : !llvm.ptr + %i_i64 = arith.index_cast %i : index to i64 + %v_64 = arith.extf %v : f32 to f64 + %e_64 = arith.extf %expected : f32 to f64 + llvm.call @printf(%fmt_fail, %rank, %i_i64, %v_64, %e_64) + vararg(!llvm.func) + : (!llvm.ptr, i32, i64, f64, f64) -> i32 + } + %inc = arith.addi %nfail_acc, %c1_i32 : i32 + scf.yield %inc : i32 + } else { + scf.yield %nfail_acc : i32 + } + scf.yield %new_nfail : i32 + } + + %ok_all = arith.cmpi eq, %nfail, %c0_i32 : i32 + scf.if %ok_all { + %fmt_c = llvm.mlir.addressof @msg_pass_c : !llvm.ptr + %v0 = memref.load %hb[%c0_idx] : memref<256xf32> + %v0_64 = arith.extf %v0 : f32 to f64 + llvm.call @printf(%fmt_c, %v0_64) + vararg(!llvm.func) : (!llvm.ptr, f64) -> i32 + } else { + func.call @exit(%c1_i32) : (i32) -> () + } + + memref.dealloc %hb : memref<256xf32> + func.call @mgpuMemFree(%verify_ptr, %nullptr) : (!llvm.ptr, !llvm.ptr) -> () + } + } + + func.call @mgpuBarrier() : () -> () + func.call @mgpuMemFree(%bases_devptr, %nullptr) : (!llvm.ptr, 
!llvm.ptr) -> () + func.call @mgpuSymmetricFree(%data_ptr, %nullptr) : (!llvm.ptr, !llvm.ptr) -> () + func.call @mgpuSymmetricFree(%flags_ptr, %nullptr) : (!llvm.ptr, !llvm.ptr) -> () + func.call @mgpuSymmetricHeapDestroy() : () -> () + + %fmt_done = llvm.mlir.addressof @msg_done : !llvm.ptr + llvm.call @printf(%fmt_done, %rank) + vararg(!llvm.func) : (!llvm.ptr, i32) -> i32 + + return + } +} diff --git a/test/gpu/symmetric_heap_dma/air_sym_handwritten_cacheline.mlir b/test/gpu/symmetric_heap_dma/air_sym_handwritten_cacheline.mlir new file mode 100644 index 000000000..5c65a6bd0 --- /dev/null +++ b/test/gpu/symmetric_heap_dma/air_sym_handwritten_cacheline.mlir @@ -0,0 +1,358 @@ +//===- air_sym_handwritten_cacheline.mlir - multi-GPU e2e (cache line) ----===// +// +// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved. +// SPDX-License-Identifier: MIT +// +//===------------------------------------------------------------------===// +// +// Symmetric-heap producer/consumer e2e (WORLD_SIZE=2), cache-line variant. +// Sister file: air_sym_handwritten_atomic.mlir uses LLVM atomicrmw / atomic +// load with syncscope("") for the cross-rank handoff. +// +// rank 0 launches @producer; rank 1 launches @consumer. +// +// Message-passing via cache-line atomicity (no atomics, no fences) +// ================================================================ +// +// Assuming one cache line = 128 bytes = 32 i32: +// +// ┌─────────────────────────────────────────────────────┐ +// │ 128-byte cache line │ +// ├────┬────┬────┬────┬─── ··· ───┬────┬───────────────┤ +// lane: │ 0 │ 1 │ 2 │ 3 │ │ 30 │ 31 ◄── flag │ +// ├────┼────┼────┼────┤ ├────┼───────────────┤ +// init: │ 0 │ 0 │ 0 │ 0 │ 0 ··· │ 0 │ 0 │ +// ├────┼────┼────┼────┤ ├────┼───────────────┤ +// prod: │100 │101 │102 │103 │ lane+100 │130 │ 1 │ +// └────┴────┴────┴────┴─── ··· ───┴────┴───────────────┘ +// +// Producer (rank 0, 1 wave × 64 lanes): +// data[lane] = (lane == 31) ? 
1 : (lane + 100) // single vec store +// +// Consumer (rank 1, 1 wave × 64 lanes), spin loop: +// v = data[lane] // single vec load +// flag = gpu.shuffle idx v, lane=31, width=64 // broadcast lane 31's val +// if flag == 1: break, else retry +// +// Why this works on gfx940 / MI300: +// - Producer's vec-store commits the whole 128-byte cache line as one HW +// transaction; lane 31's "1" is published with the same coherence event +// as lanes 0..30's payload (the compiler cannot split a uniform vector +// store of 32 i32 into per-lane sub-stores). +// - The XGMI coherence fabric on MI300 publishes peer cache lines whole +// (not per-lane), so when consumer's lane 31 observes flag==1, lanes +// 0..30 of the same line are guaranteed visible from this load. +// - shuffle-broadcast of the flag is wave-uniform, so all 64 lanes break +// in lockstep; no need for control-flow synchronization. +// +// Trade-off vs the previous LLVM-atomic design: this trades a spec-defined +// ordering contract (atomicrmw release / atomic load acquire with +// syncscope("") = AMDGPUUsage System) for a microarchitectural one. It is +// simpler and matches how real GPU code does fast intra-rank handoff, but +// the atomicity guarantee is not in the AMDGPU LangRef the way LLVM atomic +// scopes are. +// +// Note on lanes 32..63: data is sized to one cache line (32 i32), so only +// lanes 0..31 access it. Lanes 32..63 still participate in gpu.shuffle so +// the shuffle stays wave-uniform; their loads are guarded by `lane < 32`. +// +// Launcher: run.sh forks N processes with RANK / WORLD_SIZE / LOCAL_RANK. 
+//
+//===------------------------------------------------------------------===//
+
+module attributes {gpu.container_module} {
+  // ---- mgpu* C ABI declarations -----------------------------------------
+  func.func private @mgpuSymmetricHeapInit(i64)
+  func.func private @mgpuSymmetricHeapDestroy()
+  func.func private @mgpuGetRank() -> i32
+  func.func private @mgpuGetWorldSize() -> i32
+  func.func private @mgpuSymmetricAlloc(i64, !llvm.ptr) -> !llvm.ptr
+  func.func private @mgpuSymmetricFree(!llvm.ptr, !llvm.ptr)
+  func.func private @mgpuGetHeapBases() -> !llvm.ptr
+  func.func private @mgpuBarrier()
+  func.func private @mgpuMemAlloc(i64, !llvm.ptr, i1) -> !llvm.ptr
+  func.func private @mgpuMemFree(!llvm.ptr, !llvm.ptr)
+  func.func private @mgpuMemcpy(!llvm.ptr, !llvm.ptr, i64, !llvm.ptr)
+
+  // libc exit — verify branch calls this on any mismatch so run.sh
+  // sees a non-zero process exit (no green-without-validation).
+  func.func private @exit(i32)
+
+  llvm.func @printf(!llvm.ptr, ...) -> i32
+
+  llvm.mlir.global internal constant @msg_init(  // printf args: rank (i32), world (i32)
+      "[mlir] rank %d / world %d, init OK\0A\00") {addr_space = 0 : i32}
+  llvm.mlir.global internal constant @msg_pass_p(  // printf args: none
+      "[mlir] rank 0 (producer): kernel returned\0A\00") {addr_space = 0 : i32}
+  llvm.mlir.global internal constant @msg_pass_c(  // printf args: data[0] (i32), flag (i32)
+      "[mlir] rank 1 (consumer): cache-line message PASS (data[0]=%d, flag=%d)\0A\00") {addr_space = 0 : i32}
+  llvm.mlir.global internal constant @msg_fail(  // printf args: rank (i32), idx (i64), got (i32), expected (i32) -- must match the 4 varargs at the call site
+      "[mlir] rank %d (consumer): MISMATCH at idx=%ld got=%d expected=%d\0A\00") {addr_space = 0 : i32}
+  llvm.mlir.global internal constant @msg_done(  // printf args: rank (i32)
+      "[mlir] rank %d: ALL PASSED\0A\00") {addr_space = 0 : i32}
+
+  // ---- GPU kernels ------------------------------------------------------
+  gpu.module @sym_kernels {
+
+    // Producer: 1 wave × 64 lanes; lanes 0..31 write one cache line into
+    // peer's data buffer, with lane 31 == 1 (flag) and lanes 0..30 ==
+    // lane+100 (payload). Lanes 32..63 idle.
+    gpu.func @producer(%data : memref<32xi32>,
+                       %bases : memref) kernel
+        attributes {gpu.known_block_size = array,
+                    gpu.known_grid_size = array} {
+      %c1_i32 = arith.constant 1 : i32
+      %c100_i32 = arith.constant 100 : i32
+      %c31 = arith.constant 31 : index
+      %c32 = arith.constant 32 : index
+      %from = arith.constant 0 : index  // rank 0 (producer)
+      %to = arith.constant 1 : index    // rank 1 (consumer)
+
+      %tid = gpu.thread_id x
+      %active = arith.cmpi ult, %tid, %c32 : index  // only lanes 0..31 store below
+      %peer_data = air.translate %data, %from, %to, %bases
+          : memref<32xi32>, memref
+
+      scf.if %active {
+        %is_flag = arith.cmpi eq, %tid, %c31 : index
+        %tid_i32 = arith.index_cast %tid : index to i32
+        %payload = arith.addi %tid_i32, %c100_i32 : i32
+        %val = arith.select %is_flag, %c1_i32, %payload : i32  // lane 31 -> 1 (flag), others -> lane+100
+        memref.store %val, %peer_data[%tid] : memref<32xi32>
+      }
+      gpu.return
+    }
+
+    // Consumer: 1 wave × 64 lanes; spin on local data (already peer-mapped
+    // by symmetric heap), broadcasting lane 31 via gpu.shuffle until it
+    // observes flag==1. Then lanes 0..31 store their loaded value into
+    // verify_buf for host check.
+    gpu.func @consumer(%data : memref<32xi32>,
+                       %verify_buf : memref<32xi32>) kernel
+        attributes {gpu.known_block_size = array,
+                    gpu.known_grid_size = array} {
+      %c0_i32 = arith.constant 0 : i32
+      %c1_i32 = arith.constant 1 : i32
+      %c31_i32 = arith.constant 31 : i32
+      %c64_i32 = arith.constant 64 : i32
+      %c32 = arith.constant 32 : index
+
+      %tid = gpu.thread_id x
+      %active = arith.cmpi ult, %tid, %c32 : index
+
+      // Spin loop: all 64 lanes participate so the shuffle stays uniform.
+      // Lanes 32..63 contribute a constant 0 (%c0_i32) to the shuffle (shfl
+      // reads lane 31, so their input is irrelevant) and do no memory work.
+      // The loop's exit predicate is wave-uniform (flag is a broadcast),
+      // so all lanes break together.
+ %final_v = scf.while (%dummy = %c0_i32) : (i32) -> i32 { + %v = scf.if %active -> i32 { + %loaded = memref.load %data[%tid] : memref<32xi32> + scf.yield %loaded : i32 + } else { + scf.yield %c0_i32 : i32 + } + %flag, %valid = gpu.shuffle idx %v, %c31_i32, %c64_i32 : i32 + %not_ready = arith.cmpi ne, %flag, %c1_i32 : i32 + scf.condition(%not_ready) %v : i32 + } do { + ^bb0(%v_iter : i32): + scf.yield %v_iter : i32 + } + + scf.if %active { + memref.store %final_v, %verify_buf[%tid] : memref<32xi32> + } + gpu.return + } + } + + // ---- Helpers ---------------------------------------------------------- + // Single ABI-leaking helper: wrap a raw runtime !llvm.ptr as a 1-D byte + // memref. All typed views below derive from this via memref.view, so the + // hand-built LLVM-struct descriptor literal lives in exactly one place. + // Phase 4's AIRSymmetricAllocToMgpuPass will replace this entirely. + func.func private @wrap_bytes(%ptr : !llvm.ptr, %size : i64) -> memref { + %c0_i64 = arith.constant 0 : i64 + %c1_i64 = arith.constant 1 : i64 + %d0 = llvm.mlir.poison : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %d1 = llvm.insertvalue %ptr, %d0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %d2 = llvm.insertvalue %ptr, %d1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %d3 = llvm.insertvalue %c0_i64, %d2[2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %d4 = llvm.insertvalue %size, %d3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %d5 = llvm.insertvalue %c1_i64, %d4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %m = builtin.unrealized_conversion_cast %d5 + : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> to memref + return %m : memref + } + + // ---- main ------------------------------------------------------------ + func.func @main() { + %c0_i32 = arith.constant 0 : i32 + %c1_i32 = arith.constant 1 : i32 + %c0_i64 = 
arith.constant 0 : i64 + %c128_bytes = arith.constant 128 : i64 // 32 i32 = one cache line + %heap_size = arith.constant 268435456 : i64 // 256 MB + %nullptr = llvm.mlir.zero : !llvm.ptr + %false = arith.constant false + + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + + // Heap init (collective). + func.call @mgpuSymmetricHeapInit(%heap_size) : (i64) -> () + %rank = func.call @mgpuGetRank() : () -> i32 + %world = func.call @mgpuGetWorldSize() : () -> i32 + %fmt_init = llvm.mlir.addressof @msg_init : !llvm.ptr + llvm.call @printf(%fmt_init, %rank, %world) + vararg(!llvm.func) : (!llvm.ptr, i32, i32) -> i32 + + // Single 128-byte symmetric allocation (32 i32 = one cache line). + %data_ptr = func.call @mgpuSymmetricAlloc(%c128_bytes, %nullptr) + : (i64, !llvm.ptr) -> !llvm.ptr + + // Zero-init data from host so the consumer's spin starts seeing flag=0 + // (and so the validation can distinguish "never written" from "wrote 0"). + %data_host = memref.alloc() : memref<32xi32> + %dc0 = arith.constant 0 : index + %dc1 = arith.constant 1 : index + %dc32 = arith.constant 32 : index + scf.for %i = %dc0 to %dc32 step %dc1 { + memref.store %c0_i32, %data_host[%i] : memref<32xi32> + } + %data_host_intptr = memref.extract_aligned_pointer_as_index %data_host + : memref<32xi32> -> index + %data_host_int = arith.index_cast %data_host_intptr : index to i64 + %data_host_ptr = llvm.inttoptr %data_host_int : i64 to !llvm.ptr + func.call @mgpuMemcpy(%data_ptr, %data_host_ptr, %c128_bytes, %nullptr) + : (!llvm.ptr, !llvm.ptr, i64, !llvm.ptr) -> () + memref.dealloc %data_host : memref<32xi32> + + func.call @mgpuBarrier() : () -> () // zero-init visible to all ranks + + %c0_view = arith.constant 0 : index + %data_bytes = func.call @wrap_bytes(%data_ptr, %c128_bytes) + : (!llvm.ptr, i64) -> memref + %data_m = memref.view %data_bytes[%c0_view][] + : memref to memref<32xi32> + + // mgpuGetHeapBases() returns a HOST pointer; GPU can't deref it, so + // copy to device. 
TODO(airgpu): make heap_bases device-accessible + // (hipMallocManaged / hipHostMalloc-Mapped) and drop this copy. + %world_i64 = arith.extui %world : i32 to i64 + %c8_i64 = arith.constant 8 : i64 + %bases_size = arith.muli %world_i64, %c8_i64 : i64 + %bases_host = func.call @mgpuGetHeapBases() : () -> !llvm.ptr + %bases_devptr = func.call @mgpuMemAlloc(%bases_size, %nullptr, %false) + : (i64, !llvm.ptr, i1) -> !llvm.ptr + func.call @mgpuMemcpy(%bases_devptr, %bases_host, %bases_size, %nullptr) + : (!llvm.ptr, !llvm.ptr, i64, !llvm.ptr) -> () + %bases_bytes = func.call @wrap_bytes(%bases_devptr, %bases_size) + : (!llvm.ptr, i64) -> memref + %world_idx = arith.index_cast %world_i64 : i64 to index + %bases = memref.view %bases_bytes[%c0_view][%world_idx] + : memref to memref + + // Rank 0 = producer, rank 1 = consumer. Ranks > 1 idle. + // (Future: extend to all-pairs producer/consumer mesh.) + // Precondition: world >= 2 — enforced by run.sh, not re-checked here. + %is_producer = arith.cmpi eq, %rank, %c0_i32 : i32 + scf.if %is_producer { + gpu.launch_func @sym_kernels::@producer + blocks in (%c1, %c1, %c1) + threads in (%c64, %c1, %c1) + args(%data_m : memref<32xi32>, + %bases : memref) + %fmt_p = llvm.mlir.addressof @msg_pass_p : !llvm.ptr + llvm.call @printf(%fmt_p) + vararg(!llvm.func) : (!llvm.ptr) -> i32 + } else { + %is_consumer = arith.cmpi eq, %rank, %c1_i32 : i32 + scf.if %is_consumer { + %verify_ptr = func.call @mgpuMemAlloc(%c128_bytes, %nullptr, %false) + : (i64, !llvm.ptr, i1) -> !llvm.ptr + %verify_bytes = func.call @wrap_bytes(%verify_ptr, %c128_bytes) + : (!llvm.ptr, i64) -> memref + %verify_m = memref.view %verify_bytes[%c0_view][] + : memref to memref<32xi32> + gpu.launch_func @sym_kernels::@consumer + blocks in (%c1, %c1, %c1) + threads in (%c64, %c1, %c1) + args(%data_m : memref<32xi32>, + %verify_m: memref<32xi32>) + + // D2H readback verify_buf and check all 32 ints: + // verify[i] == i + 100 for i in 0..30, + // verify[31] == 1 (flag). 
+ %hb = memref.alloc() : memref<32xi32> + %hb_intptr = memref.extract_aligned_pointer_as_index %hb + : memref<32xi32> -> index + %hb_int = arith.index_cast %hb_intptr : index to i64 + %hb_ptr = llvm.inttoptr %hb_int : i64 to !llvm.ptr + func.call @mgpuMemcpy(%hb_ptr, %verify_ptr, %c128_bytes, %nullptr) + : (!llvm.ptr, !llvm.ptr, i64, !llvm.ptr) -> () + + %c0_idx = arith.constant 0 : index + %c1_idx = arith.constant 1 : index + %c31_idx = arith.constant 31 : index + %c32_idx = arith.constant 32 : index + %c100_i32 = arith.constant 100 : i32 + + // Count mismatches; print msg_fail on the first. + %nfail = scf.for %i = %c0_idx to %c32_idx step %c1_idx + iter_args(%nfail_acc = %c0_i32) -> (i32) { + %v = memref.load %hb[%i] : memref<32xi32> + %is_flag_idx = arith.cmpi eq, %i, %c31_idx : index + %expected = scf.if %is_flag_idx -> i32 { + scf.yield %c1_i32 : i32 + } else { + %i_i32 = arith.index_cast %i : index to i32 + %e = arith.addi %i_i32, %c100_i32 : i32 + scf.yield %e : i32 + } + %ne = arith.cmpi ne, %v, %expected : i32 + %new_nfail = scf.if %ne -> i32 { + %is_first = arith.cmpi eq, %nfail_acc, %c0_i32 : i32 + scf.if %is_first { + %fmt_fail = llvm.mlir.addressof @msg_fail : !llvm.ptr + %i_i64 = arith.index_cast %i : index to i64 + llvm.call @printf(%fmt_fail, %rank, %i_i64, %v, %expected) + vararg(!llvm.func) + : (!llvm.ptr, i32, i64, i32, i32) -> i32 + } + %inc = arith.addi %nfail_acc, %c1_i32 : i32 + scf.yield %inc : i32 + } else { + scf.yield %nfail_acc : i32 + } + scf.yield %new_nfail : i32 + } + + %ok_all = arith.cmpi eq, %nfail, %c0_i32 : i32 + scf.if %ok_all { + %fmt_c = llvm.mlir.addressof @msg_pass_c : !llvm.ptr + %v0 = memref.load %hb[%c0_idx] : memref<32xi32> + %vf = memref.load %hb[%c31_idx] : memref<32xi32> + llvm.call @printf(%fmt_c, %v0, %vf) + vararg(!llvm.func) : (!llvm.ptr, i32, i32) -> i32 + } else { + func.call @exit(%c1_i32) : (i32) -> () + } + + memref.dealloc %hb : memref<32xi32> + func.call @mgpuMemFree(%verify_ptr, %nullptr) : (!llvm.ptr, 
!llvm.ptr) -> () + } + } + + func.call @mgpuBarrier() : () -> () + func.call @mgpuMemFree(%bases_devptr, %nullptr) : (!llvm.ptr, !llvm.ptr) -> () + func.call @mgpuSymmetricFree(%data_ptr, %nullptr) : (!llvm.ptr, !llvm.ptr) -> () + func.call @mgpuSymmetricHeapDestroy() : () -> () + + %fmt_done = llvm.mlir.addressof @msg_done : !llvm.ptr + llvm.call @printf(%fmt_done, %rank) + vararg(!llvm.func) : (!llvm.ptr, i32) -> i32 + + return + } +} diff --git a/test/gpu/symmetric_heap_dma/air_sym_with_alloc.mlir b/test/gpu/symmetric_heap_dma/air_sym_with_alloc.mlir new file mode 100644 index 000000000..5b0e892e3 --- /dev/null +++ b/test/gpu/symmetric_heap_dma/air_sym_with_alloc.mlir @@ -0,0 +1,122 @@ +//===- air_sym_with_alloc.mlir - air.rank + memref.alloc air.symmetric e2e ===// +// +// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved. +// SPDX-License-Identifier: MIT +// +//===-----------------------------------------------------------------------===// +// +// Variant of air_sym_with_rank.mlir that uses `memref.alloc {air.symmetric}` +// instead of a direct call to `mgpuSymmetricAlloc`. Exercises Phase 3 +// (`air-rank-to-mgpu`) AND Phase 4 (`air-symmetric-alloc-to-mgpu`). +// +// The symmetric memref is wrapped/unwrapped via the standard +// `memref.extract_aligned_pointer_as_index` -> `llvm.inttoptr` idiom to +// recover the !llvm.ptr that the runtime ABI expects. +// +//===-----------------------------------------------------------------------===// + +module { + func.func private @mgpuGetWorldSize() -> i32 + func.func private @mgpuGetHeapBases() -> !llvm.ptr + func.func private @mgpuBarrier() + func.func private @mgpuMemAlloc(i64, !llvm.ptr, i1) -> !llvm.ptr + func.func private @mgpuMemFree(!llvm.ptr, !llvm.ptr) + func.func private @mgpuMemcpy(!llvm.ptr, !llvm.ptr, i64, !llvm.ptr) + func.func private @malloc(i64) -> !llvm.ptr + func.func private @free(!llvm.ptr) + llvm.func @printf(!llvm.ptr, ...) 
-> i32 + + llvm.mlir.global internal constant @msg_pass("[mlir/alloc] rank %d: cross-rank read PASS (peer=%d, expected=%.1f)\0A\00") {addr_space = 0 : i32} + llvm.mlir.global internal constant @msg_only1("[mlir/alloc] rank %d: world_size=1, skipping cross-rank read\0A\00") {addr_space = 0 : i32} + llvm.mlir.global internal constant @msg_done("[mlir/alloc] rank %d: ALL PASSED\0A\00") {addr_space = 0 : i32} + + func.func @main() { + %c2 = arith.constant 2 : index + + air.rank (%rid) in (%rsize = %c2) { + %c0_i32 = arith.constant 0 : i32 + %c1_i32 = arith.constant 1 : i32 + %c4096_i64 = arith.constant 4096 : i64 + %nullptr = llvm.mlir.zero : !llvm.ptr + %false = arith.constant false + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c1024 = arith.constant 1024 : index + + %rid_i64 = arith.index_cast %rid : index to i64 + %rid_i32 = arith.trunci %rid_i64 : i64 to i32 + %rsize_i64 = arith.index_cast %rsize : index to i64 + %rsize_i32 = arith.trunci %rsize_i64 : i64 to i32 + + // === Phase 4 lowering target: memref.alloc {air.symmetric} === + %buf_memref = memref.alloc() {air.symmetric} : memref<1024xf32> + + // Extract the underlying pointer for use with the mgpu* runtime ABI. + // (Symmetric heap memory is GPU-only; CPU writes go through mgpuMemcpy.) + %intptr = memref.extract_aligned_pointer_as_index %buf_memref + : memref<1024xf32> -> index + %buf_int = arith.index_cast %intptr : index to i64 + %buf = llvm.inttoptr %buf_int : i64 to !llvm.ptr + + // Fill (rid+1).0 from a host buffer via mgpuMemcpy H2D. 
+ %r1_i32 = arith.addi %rid_i32, %c1_i32 : i32 + %r1_f = arith.sitofp %r1_i32 : i32 to f32 + %hostbuf = func.call @malloc(%c4096_i64) : (i64) -> !llvm.ptr + scf.for %i = %c0 to %c1024 step %c1 { + %i_i64 = arith.index_cast %i : index to i64 + %addr = llvm.getelementptr %hostbuf[%i_i64] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.store %r1_f, %addr : f32, !llvm.ptr + } + func.call @mgpuMemcpy(%buf, %hostbuf, %c4096_i64, %nullptr) : (!llvm.ptr, !llvm.ptr, i64, !llvm.ptr) -> () + func.call @free(%hostbuf) : (!llvm.ptr) -> () + + func.call @mgpuBarrier() : () -> () + + %is_multi = arith.cmpi sgt, %rsize_i32, %c1_i32 : i32 + scf.if %is_multi { + %sum = arith.addi %rid_i32, %c1_i32 : i32 + %peer_i32 = arith.remsi %sum, %rsize_i32 : i32 + %bases = func.call @mgpuGetHeapBases() : () -> !llvm.ptr + %peer_i64 = arith.extsi %peer_i32 : i32 to i64 + %peer_base_addr = llvm.getelementptr %bases[%peer_i64] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.ptr + %peer_base = llvm.load %peer_base_addr : !llvm.ptr -> !llvm.ptr + %local_base_addr = llvm.getelementptr %bases[%rid_i64] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.ptr + %local_base = llvm.load %local_base_addr : !llvm.ptr -> !llvm.ptr + %lb_int = llvm.ptrtoint %local_base : !llvm.ptr to i64 + %offset = arith.subi %buf_int, %lb_int : i64 + %peer_buf = llvm.getelementptr %peer_base[%offset] : (!llvm.ptr, i64) -> !llvm.ptr, i8 + + %local_copy = func.call @mgpuMemAlloc(%c4096_i64, %nullptr, %false) : (i64, !llvm.ptr, i1) -> !llvm.ptr + func.call @mgpuMemcpy(%local_copy, %peer_buf, %c4096_i64, %nullptr) : (!llvm.ptr, !llvm.ptr, i64, !llvm.ptr) -> () + %host_rb = func.call @malloc(%c4096_i64) : (i64) -> !llvm.ptr + func.call @mgpuMemcpy(%host_rb, %local_copy, %c4096_i64, %nullptr) : (!llvm.ptr, !llvm.ptr, i64, !llvm.ptr) -> () + + %p1_i32 = arith.addi %peer_i32, %c1_i32 : i32 + %expected = arith.sitofp %p1_i32 : i32 to f32 + %c0_i64 = arith.constant 0 : i64 + %addr0 = llvm.getelementptr %host_rb[%c0_i64] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + 
%v0 = llvm.load %addr0 : !llvm.ptr -> f32 + %ok = arith.cmpf oeq, %v0, %expected : f32 + scf.if %ok { + %fmt = llvm.mlir.addressof @msg_pass : !llvm.ptr + %e64 = arith.extf %expected : f32 to f64 + llvm.call @printf(%fmt, %rid_i32, %peer_i32, %e64) vararg(!llvm.func) : (!llvm.ptr, i32, i32, f64) -> i32 + } + + func.call @free(%host_rb) : (!llvm.ptr) -> () + func.call @mgpuMemFree(%local_copy, %nullptr) : (!llvm.ptr, !llvm.ptr) -> () + } else { + %fmt = llvm.mlir.addressof @msg_only1 : !llvm.ptr + llvm.call @printf(%fmt, %rid_i32) vararg(!llvm.func) : (!llvm.ptr, i32) -> i32 + } + + func.call @mgpuBarrier() : () -> () + memref.dealloc %buf_memref : memref<1024xf32> + + %fmt_done = llvm.mlir.addressof @msg_done : !llvm.ptr + llvm.call @printf(%fmt_done, %rid_i32) vararg(!llvm.func) : (!llvm.ptr, i32) -> i32 + air.rank_terminator + } + return + } +} diff --git a/test/gpu/symmetric_heap_dma/air_sym_with_channel.mlir b/test/gpu/symmetric_heap_dma/air_sym_with_channel.mlir new file mode 100644 index 000000000..3f421db7d --- /dev/null +++ b/test/gpu/symmetric_heap_dma/air_sym_with_channel.mlir @@ -0,0 +1,105 @@ +//===- air_sym_with_channel.mlir - air.channel gpu_symmetric_heap e2e ----===// +// +// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved. +// SPDX-License-Identifier: MIT +// +//===-----------------------------------------------------------------------===// +// +// Highest-level form combining: +// - Phase 1: gpu_symmetric_heap channel_type, air.symmetric memref attribute +// - Phase 3: air-rank-to-mgpu (rank body inlining) +// - Phase 4: air-symmetric-alloc-to-mgpu (memref.alloc -> mgpuSymmetricAlloc) +// - Phase 6: air-gpu-channel-to-mgpu (gpu_symmetric_heap put/get -> peer-VA +// mgpuMemcpy + mgpuBarrier) +// +// Each rank fills a symmetric src buffer with (rank+1).0, publishes via +// air.channel.put, and reads rank 0's slot via air.channel.get into a local +// dst buffer. Both ranks should see 1.0 in dst[0]. 
+//
+//===-----------------------------------------------------------------------===//
+
+module {
+  func.func private @mgpuMemcpy(!llvm.ptr, !llvm.ptr, i64, !llvm.ptr)
+  func.func private @malloc(i64) -> !llvm.ptr
+  func.func private @free(!llvm.ptr)
+  llvm.func @printf(!llvm.ptr, ...) -> i32
+
+  llvm.mlir.global internal constant @msg_pass("[mlir/chan] rank %d: channel get PASS (read rank 0 = %.1f)\0A\00") {addr_space = 0 : i32}
+  llvm.mlir.global internal constant @msg_done("[mlir/chan] rank %d: ALL PASSED\0A\00") {addr_space = 0 : i32}
+
+  // Channel decl at module scope (Symbol).
+  air.channel @sym_chan [] {channel_type = "gpu_symmetric_heap"}
+
+  func.func @main() {
+    %c2 = arith.constant 2 : index
+
+    air.rank (%rid) in (%rsize = %c2) {
+      %c0 = arith.constant 0 : index
+      %c1 = arith.constant 1 : index
+      %c1024 = arith.constant 1024 : index
+      %c1_i32 = arith.constant 1 : i32
+      %c4096_i64 = arith.constant 4096 : i64  // byte size of memref<1024xf32>
+      %nullptr = llvm.mlir.zero : !llvm.ptr
+
+      %rid_i64 = arith.index_cast %rid : index to i64
+      %rid_i32 = arith.trunci %rid_i64 : i64 to i32
+
+      // Symmetric src buffer (each rank allocates same shape at same offset).
+      %src_buf = memref.alloc() {air.symmetric} : memref<1024xf32>
+      // Destination buffer. NOTE(review): also allocated {air.symmetric}; confirm a plain local alloc was not intended.
+      %dst_buf = memref.alloc() {air.symmetric} : memref<1024xf32>
+
+      // Fill src_buf with (rid+1).0 from host.
+ %r1_i32 = arith.addi %rid_i32, %c1_i32 : i32 + %r1_f = arith.sitofp %r1_i32 : i32 to f32 + %hostbuf = func.call @malloc(%c4096_i64) : (i64) -> !llvm.ptr + scf.for %i = %c0 to %c1024 step %c1 { + %i_i64 = arith.index_cast %i : index to i64 + %addr = llvm.getelementptr %hostbuf[%i_i64] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.store %r1_f, %addr : f32, !llvm.ptr + } + %src_intptr = memref.extract_aligned_pointer_as_index %src_buf + : memref<1024xf32> -> index + %src_int = arith.index_cast %src_intptr : index to i64 + %src_ptr = llvm.inttoptr %src_int : i64 to !llvm.ptr + func.call @mgpuMemcpy(%src_ptr, %hostbuf, %c4096_i64, %nullptr) + : (!llvm.ptr, !llvm.ptr, i64, !llvm.ptr) -> () + + // === Phase 6 lowering target: gpu_symmetric_heap channel put/get === + // put publishes our src_buf; get reads peer (rank 0) into dst_buf. + air.channel.put @sym_chan[] (%src_buf[] [] []) : (memref<1024xf32>) + air.channel.get @sym_chan[%c0] (%dst_buf[] [] []) : (memref<1024xf32>) + + // Verify: D2H readback dst_buf to a host buffer, check element 0. 
+ %dst_intptr = memref.extract_aligned_pointer_as_index %dst_buf + : memref<1024xf32> -> index + %dst_int = arith.index_cast %dst_intptr : index to i64 + %dst_ptr = llvm.inttoptr %dst_int : i64 to !llvm.ptr + %host_rb = func.call @malloc(%c4096_i64) : (i64) -> !llvm.ptr + func.call @mgpuMemcpy(%host_rb, %dst_ptr, %c4096_i64, %nullptr) + : (!llvm.ptr, !llvm.ptr, i64, !llvm.ptr) -> () + %c0_i64 = arith.constant 0 : i64 + %addr0 = llvm.getelementptr %host_rb[%c0_i64] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %v0 = llvm.load %addr0 : !llvm.ptr -> f32 + %expected = arith.constant 1.0 : f32 + %ok = arith.cmpf oeq, %v0, %expected : f32 + scf.if %ok { + %fmt = llvm.mlir.addressof @msg_pass : !llvm.ptr + %v0_64 = arith.extf %v0 : f32 to f64 + llvm.call @printf(%fmt, %rid_i32, %v0_64) vararg(!llvm.func) + : (!llvm.ptr, i32, f64) -> i32 + } + func.call @free(%host_rb) : (!llvm.ptr) -> () + func.call @free(%hostbuf) : (!llvm.ptr) -> () + + memref.dealloc %dst_buf : memref<1024xf32> + memref.dealloc %src_buf : memref<1024xf32> + + %fmt_done = llvm.mlir.addressof @msg_done : !llvm.ptr + llvm.call @printf(%fmt_done, %rid_i32) vararg(!llvm.func) + : (!llvm.ptr, i32) -> i32 + air.rank_terminator + } + return + } +} diff --git a/test/gpu/symmetric_heap_dma/air_sym_with_dma.mlir b/test/gpu/symmetric_heap_dma/air_sym_with_dma.mlir new file mode 100644 index 000000000..c5d2d9413 --- /dev/null +++ b/test/gpu/symmetric_heap_dma/air_sym_with_dma.mlir @@ -0,0 +1,109 @@ +//===- air_sym_with_dma.mlir - air.rank + air.dma_memcpy_nd cross-rank ----===// +// +// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved. +// SPDX-License-Identifier: MIT +// +//===-----------------------------------------------------------------------===// +// +// Highest-level form of the symmetric-heap test. 
Combines: +// - Phase 1: air.symmetric memref attribute, src_rank attribute on +// air.dma_memcpy_nd +// - Phase 3: air-rank-to-mgpu (rank body inlining) +// - Phase 4: air-symmetric-alloc-to-mgpu (memref.alloc -> mgpuSymmetricAlloc) +// - Phase 5: air-cross-rank-dma-to-mgpu (cross-rank dma -> peer-VA mgpuMemcpy) +// +// Each rank allocates two symmetric buffers (src and dst), fills its src with +// (rank+1).0, then issues a cross-rank DMA reading rank 0's src into its +// own dst, and verifies dst contains 1.0 on every rank. +// +//===-----------------------------------------------------------------------===// + +module { + func.func private @mgpuBarrier() + func.func private @mgpuMemcpy(!llvm.ptr, !llvm.ptr, i64, !llvm.ptr) + func.func private @malloc(i64) -> !llvm.ptr + func.func private @free(!llvm.ptr) + llvm.func @printf(!llvm.ptr, ...) -> i32 + + llvm.mlir.global internal constant @msg_pass("[mlir/dma] rank %d: cross-rank DMA PASS (read rank 0 = %.1f)\0A\00") {addr_space = 0 : i32} + llvm.mlir.global internal constant @msg_done("[mlir/dma] rank %d: ALL PASSED\0A\00") {addr_space = 0 : i32} + + func.func @main() { + %c2 = arith.constant 2 : index + + air.rank (%rid) in (%rsize = %c2) { + %c1_i32 = arith.constant 1 : i32 + %c4096_i64 = arith.constant 4096 : i64 + %nullptr = llvm.mlir.zero : !llvm.ptr + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c1024 = arith.constant 1024 : index + + %rid_i64 = arith.index_cast %rid : index to i64 + %rid_i32 = arith.trunci %rid_i64 : i64 to i32 + + // Two symmetric buffers per rank (collective allocation). + %src_buf = memref.alloc() {air.symmetric} : memref<1024xf32> + %dst_buf = memref.alloc() {air.symmetric} : memref<1024xf32> + + // Get pointers for the H2D init (and later D2H verification). 
+ %src_intptr = memref.extract_aligned_pointer_as_index %src_buf + : memref<1024xf32> -> index + %src_int = arith.index_cast %src_intptr : index to i64 + %src_ptr = llvm.inttoptr %src_int : i64 to !llvm.ptr + + %dst_intptr = memref.extract_aligned_pointer_as_index %dst_buf + : memref<1024xf32> -> index + %dst_int = arith.index_cast %dst_intptr : index to i64 + %dst_ptr = llvm.inttoptr %dst_int : i64 to !llvm.ptr + + // Fill src_buf with (rid+1).0 via host buffer + mgpuMemcpy H2D. + %r1_i32 = arith.addi %rid_i32, %c1_i32 : i32 + %r1_f = arith.sitofp %r1_i32 : i32 to f32 + %hostbuf = func.call @malloc(%c4096_i64) : (i64) -> !llvm.ptr + scf.for %i = %c0 to %c1024 step %c1 { + %i_i64 = arith.index_cast %i : index to i64 + %addr = llvm.getelementptr %hostbuf[%i_i64] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.store %r1_f, %addr : f32, !llvm.ptr + } + func.call @mgpuMemcpy(%src_ptr, %hostbuf, %c4096_i64, %nullptr) + : (!llvm.ptr, !llvm.ptr, i64, !llvm.ptr) -> () + func.call @mgpuBarrier() : () -> () + + // === Phase 5 lowering target: cross-rank air.dma_memcpy_nd === + // Both ranks read from rank 0's src_buf into their own dst_buf. + air.dma_memcpy_nd (%dst_buf[] [] [], %src_buf[] [] []) + {src_rank = 0 : i64} + : (memref<1024xf32>, memref<1024xf32>) + + // Verify: D2H readback dst_buf to a host buffer, check element 0. + // On every rank, dst_buf should contain (rank0 + 1).0 == 1.0. 
+ %host_rb = func.call @malloc(%c4096_i64) : (i64) -> !llvm.ptr + func.call @mgpuMemcpy(%host_rb, %dst_ptr, %c4096_i64, %nullptr) + : (!llvm.ptr, !llvm.ptr, i64, !llvm.ptr) -> () + %c0_i64 = arith.constant 0 : i64 + %addr0 = llvm.getelementptr %host_rb[%c0_i64] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %v0 = llvm.load %addr0 : !llvm.ptr -> f32 + %expected = arith.constant 1.0 : f32 + %ok = arith.cmpf oeq, %v0, %expected : f32 + scf.if %ok { + %fmt = llvm.mlir.addressof @msg_pass : !llvm.ptr + %v0_64 = arith.extf %v0 : f32 to f64 + llvm.call @printf(%fmt, %rid_i32, %v0_64) vararg(!llvm.func) + : (!llvm.ptr, i32, f64) -> i32 + } + func.call @free(%host_rb) : (!llvm.ptr) -> () + + func.call @mgpuBarrier() : () -> () + func.call @free(%hostbuf) : (!llvm.ptr) -> () + memref.dealloc %dst_buf : memref<1024xf32> + memref.dealloc %src_buf : memref<1024xf32> + + %fmt_done = llvm.mlir.addressof @msg_done : !llvm.ptr + llvm.call @printf(%fmt_done, %rid_i32) vararg(!llvm.func) + : (!llvm.ptr, i32) -> i32 + air.rank_terminator + } + return + } +} diff --git a/test/gpu/symmetric_heap_dma/air_sym_with_rank.mlir b/test/gpu/symmetric_heap_dma/air_sym_with_rank.mlir new file mode 100644 index 000000000..cf5416347 --- /dev/null +++ b/test/gpu/symmetric_heap_dma/air_sym_with_rank.mlir @@ -0,0 +1,122 @@ +//===- air_sym_with_rank.mlir - High-level air.rank multi-GPU e2e --------===// +// +// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved. +// SPDX-License-Identifier: MIT +// +//===-----------------------------------------------------------------------===// +// +// Higher-level version of air_sym_handwritten.mlir that uses `air.rank` to +// express the multi-process world. The `air-rank-to-mgpu` pass lowers +// air.rank to inline body + mgpuGetRank() / mgpuSymmetricHeapInit / Destroy. +// +// Once lowered, the IR matches air_sym_handwritten.mlir's behavior. 
After
+// `mlir-opt --pass-pipeline=...`, both forms should run identically under
+// the multi-process driver run.sh.
+//
+//===-----------------------------------------------------------------------===//
+
+module {
+  // ---- mgpu* C ABI declarations --------------------------------------
+  func.func private @mgpuGetRank() -> i32
+  func.func private @mgpuGetWorldSize() -> i32
+  func.func private @mgpuSymmetricAlloc(i64, !llvm.ptr) -> !llvm.ptr
+  func.func private @mgpuSymmetricFree(!llvm.ptr, !llvm.ptr)
+  func.func private @mgpuGetHeapBases() -> !llvm.ptr
+  func.func private @mgpuBarrier()
+  func.func private @mgpuMemAlloc(i64, !llvm.ptr, i1) -> !llvm.ptr
+  func.func private @mgpuMemFree(!llvm.ptr, !llvm.ptr)
+  func.func private @mgpuMemcpy(!llvm.ptr, !llvm.ptr, i64, !llvm.ptr)
+
+  // libc helpers
+  func.func private @malloc(i64) -> !llvm.ptr
+  func.func private @free(!llvm.ptr)
+  llvm.func @printf(!llvm.ptr, ...) -> i32
+
+  llvm.mlir.global internal constant @msg_pass("[mlir/rank] rank %d: cross-rank read PASS (peer=%d, expected=%.1f)\0A\00") {addr_space = 0 : i32}
+  llvm.mlir.global internal constant @msg_only1("[mlir/rank] rank %d: world_size=1, skipping cross-rank read\0A\00") {addr_space = 0 : i32}
+  llvm.mlir.global internal constant @msg_done("[mlir/rank] rank %d: ALL PASSED\0A\00") {addr_space = 0 : i32}
+
+  func.func @main() {
+    %c2 = arith.constant 2 : index
+
+    // High-level: a 2-rank world. The body executes once per rank.
+    air.rank (%rid) in (%rsize = %c2) {
+      %c0_i32 = arith.constant 0 : i32
+      %c1_i32 = arith.constant 1 : i32
+      %c4096_i64 = arith.constant 4096 : i64
+      %nullptr = llvm.mlir.zero : !llvm.ptr
+      %false = arith.constant false
+      %c0 = arith.constant 0 : index
+      %c1 = arith.constant 1 : index
+      %c1024 = arith.constant 1024 : index
+
+      // Convert rank id (index) to i32 for printf and arithmetic.
+      %rid_i64 = arith.index_cast %rid : index to i64
+      %rid_i32 = arith.trunci %rid_i64 : i64 to i32
+      %rsize_i64 = arith.index_cast %rsize : index to i64
+      %rsize_i32 = arith.trunci %rsize_i64 : i64 to i32
+
+      %buf = func.call @mgpuSymmetricAlloc(%c4096_i64, %nullptr) : (i64, !llvm.ptr) -> !llvm.ptr
+
+      // Fill buf with (rank+1).0 from host: 1024 f32 values = 4096 bytes.
+      %hostbuf = func.call @malloc(%c4096_i64) : (i64) -> !llvm.ptr
+      %r1_i32 = arith.addi %rid_i32, %c1_i32 : i32
+      %r1_f = arith.sitofp %r1_i32 : i32 to f32
+      scf.for %i = %c0 to %c1024 step %c1 {
+        %i_i64 = arith.index_cast %i : index to i64
+        %addr = llvm.getelementptr %hostbuf[%i_i64] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+        llvm.store %r1_f, %addr : f32, !llvm.ptr
+      }
+      func.call @mgpuMemcpy(%buf, %hostbuf, %c4096_i64, %nullptr) : (!llvm.ptr, !llvm.ptr, i64, !llvm.ptr) -> ()
+      func.call @mgpuBarrier() : () -> ()
+
+      %is_multi = arith.cmpi sgt, %rsize_i32, %c1_i32 : i32
+      scf.if %is_multi {
+        %sum = arith.addi %rid_i32, %c1_i32 : i32
+        %peer_i32 = arith.remsi %sum, %rsize_i32 : i32
+        %bases = func.call @mgpuGetHeapBases() : () -> !llvm.ptr
+        %peer_i64 = arith.extsi %peer_i32 : i32 to i64
+        %peer_base_addr = llvm.getelementptr %bases[%peer_i64] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.ptr
+        %peer_base = llvm.load %peer_base_addr : !llvm.ptr -> !llvm.ptr
+        %local_base_addr = llvm.getelementptr %bases[%rid_i64] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.ptr
+        %local_base = llvm.load %local_base_addr : !llvm.ptr -> !llvm.ptr
+        %buf_int = llvm.ptrtoint %buf : !llvm.ptr to i64
+        %lb_int = llvm.ptrtoint %local_base : !llvm.ptr to i64
+        %offset = arith.subi %buf_int, %lb_int : i64
+        %peer_buf = llvm.getelementptr %peer_base[%offset] : (!llvm.ptr, i64) -> !llvm.ptr, i8
+
+        %local_copy = func.call @mgpuMemAlloc(%c4096_i64, %nullptr, %false) : (i64, !llvm.ptr, i1) -> !llvm.ptr
+        func.call @mgpuMemcpy(%local_copy, %peer_buf, %c4096_i64, %nullptr) : (!llvm.ptr, !llvm.ptr, i64, !llvm.ptr) -> ()
+        %host_rb = func.call @malloc(%c4096_i64) : (i64) -> !llvm.ptr
+        func.call @mgpuMemcpy(%host_rb, %local_copy, %c4096_i64, %nullptr) : (!llvm.ptr, !llvm.ptr, i64, !llvm.ptr) -> ()
+
+        %p1_i32 = arith.addi %peer_i32, %c1_i32 : i32
+        %expected = arith.sitofp %p1_i32 : i32 to f32
+        %c0_i64 = arith.constant 0 : i64
+        %addr0 = llvm.getelementptr %host_rb[%c0_i64] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+        %v0 = llvm.load %addr0 : !llvm.ptr -> f32  // only element 0 is compared
+        %ok = arith.cmpf oeq, %v0, %expected : f32
+        scf.if %ok {  // NOTE(review): no else branch; a mismatch prints nothing and msg_done still prints
+          %fmt = llvm.mlir.addressof @msg_pass : !llvm.ptr
+          %e64 = arith.extf %expected : f32 to f64
+          llvm.call @printf(%fmt, %rid_i32, %peer_i32, %e64) vararg(!llvm.func<i32 (ptr, ...)>) : (!llvm.ptr, i32, i32, f64) -> i32
+        }
+
+        func.call @free(%host_rb) : (!llvm.ptr) -> ()
+        func.call @mgpuMemFree(%local_copy, %nullptr) : (!llvm.ptr, !llvm.ptr) -> ()
+      } else {
+        %fmt = llvm.mlir.addressof @msg_only1 : !llvm.ptr
+        llvm.call @printf(%fmt, %rid_i32) vararg(!llvm.func<i32 (ptr, ...)>) : (!llvm.ptr, i32) -> i32
+      }
+
+      func.call @mgpuBarrier() : () -> ()
+      func.call @free(%hostbuf) : (!llvm.ptr) -> ()
+      func.call @mgpuSymmetricFree(%buf, %nullptr) : (!llvm.ptr, !llvm.ptr) -> ()
+
+      %fmt_done = llvm.mlir.addressof @msg_done : !llvm.ptr
+      llvm.call @printf(%fmt_done, %rid_i32) vararg(!llvm.func<i32 (ptr, ...)>) : (!llvm.ptr, i32) -> i32
+      air.rank_terminator
+    }
+    return
+  }
+}
diff --git a/test/gpu/symmetric_heap_dma/run.sh b/test/gpu/symmetric_heap_dma/run.sh
new file mode 100755
index 000000000..9067bc841
--- /dev/null
+++ b/test/gpu/symmetric_heap_dma/run.sh
@@ -0,0 +1,162 @@
+#!/usr/bin/env bash
+#===- run.sh - Multi-process symmetric-heap DMA e2e test --*-
+#
+# Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
+# SPDX-License-Identifier: MIT
+#
+#===------------------------------------------------------------------===//
+#
+# Compile and run the hand-written symmetric-heap MLIR test as N processes.
+# Each process executes the full IR; processes coordinate via the symmetric
+# heap (XGMI peer-mapped VMem buffers).
+#
+# Usage: run.sh [num_ranks]    (default: 2)
+#
+# Required environment (auto-detected when sourced via env_setup_gpu.sh):
+#   MLIR_AIR_INSTALL_DIR  - path containing lib/libairgpu.so
+#   LLVM_INSTALL_DIR      - path containing bin/mlir-opt + lib/libmlir_*.so
+#
+
+set -e
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+NUM_RANKS=${1:-2}
+TMPDIR="${TMPDIR:-/tmp/air_sym_dma}"
+mkdir -p "$TMPDIR"
+
+# Cross-rank symmetric-heap test fundamentally requires a producer + a
+# consumer process. Refuse single-process (or non-numeric) rank counts loudly
+# rather than letting the kernel silently no-op or hang.
+if ! [ "$NUM_RANKS" -ge 2 ] 2>/dev/null; then
+  echo "ERROR: NUM_RANKS=$NUM_RANKS; this test requires >= 2 ranks (producer + consumer)." >&2
+  exit 1
+fi
+
+# Refuse to run if there aren't enough physically distinct GPUs for one
+# rank per GPU. Colocating ranks on a single GPU would make XGMI/peer-VA
+# transparently fall back to local memory and produce false-positive PASSes.
+if [ -n "${HIP_VISIBLE_DEVICES:-}" ]; then
+  mapfile -t GPU_IDS < <(echo "$HIP_VISIBLE_DEVICES" | tr ',' '\n' | grep .)
+else
+  mapfile -t GPU_IDS < <(seq 0 $(($(grep -l '^simd_count [1-9]' /sys/class/kfd/kfd/topology/nodes/*/properties 2>/dev/null | wc -l) - 1)))
+fi
+if [ "${#GPU_IDS[@]}" -lt "$NUM_RANKS" ]; then
+  echo "ERROR: need >= $NUM_RANKS GPUs to validate cross-rank XGMI traffic; found ${#GPU_IDS[@]}." >&2
+  echo " This test refuses to colocate ranks on a single GPU because it would" >&2
+  echo " silently bypass the symmetric-heap path and report false PASSes." >&2
+  exit 1
+fi
+
+LLVM_LIB_DIR="${LLVM_INSTALL_DIR:-$(dirname "$(command -v mlir-opt)")/..}/lib"
+AIRGPU_LIB="${MLIR_AIR_INSTALL_DIR:-$(dirname "$(command -v air-opt)")/..}/lib/libairgpu.so"
+
+# Input MLIR can be selected via INPUT env var.
+#   atomic    — kernel-driven producer/consumer, LLVM atomicrmw + atomic
+#               load with syncscope("") (Phase 2)
+#   cacheline — kernel-driven producer/consumer, cache-line atomicity +
+#               gpu.shuffle (Phase 2)
+#   rank      — high-level air.rank form (Phase 3); alloc/dma/channel/prelowered: see arms below
+INPUT="${INPUT:-cacheline}"
+case "$INPUT" in
+  atomic|cacheline)
+    # Kernel-driven test: needs the full GPU compilation chain
+    # (rocdl-attach-target → convert-gpu-to-rocdl → gpu-module-to-binary).
+    SRC_MLIR="$SCRIPT_DIR/air_sym_handwritten_${INPUT}.mlir"
+    echo "Step 1a: Expand air.translate ops ($INPUT variant)"
+    air-opt "$SRC_MLIR" --air-translate-to-llvm \
+      -o "$TMPDIR/sym_post_translate.mlir"
+    echo "Step 1b: Compile gpu.module to AMDGPU binary + finalize host"
+    mlir-opt "$TMPDIR/sym_post_translate.mlir" \
+      --pass-pipeline='builtin.module(rocdl-attach-target{chip=gfx942 O=3},gpu.module(convert-scf-to-cf,convert-gpu-to-rocdl{chipset=gfx942 runtime=HIP},reconcile-unrealized-casts),gpu-module-to-binary,func.func(gpu-async-region,convert-scf-to-cf),gpu-to-llvm,convert-to-llvm,reconcile-unrealized-casts)' \
+      -o "$TMPDIR/sym_lowered.mlir"
+    SKIP_LOWER=1
+    ;;
+  rank)
+    # Host-orchestrated test: simple LLVM-only pipeline.
+    echo "Step 1a: Lower air.rank to mgpu*"
+    air-opt "$SCRIPT_DIR/air_sym_with_rank.mlir" -air-rank-to-mgpu \
+      -o "$TMPDIR/post_rank.mlir"
+    echo "Step 1b: Lower IR to LLVM dialect"
+    mlir-opt "$TMPDIR/post_rank.mlir" \
+      --pass-pipeline='builtin.module(func.func(convert-scf-to-cf),convert-to-llvm,reconcile-unrealized-casts)' \
+      -o "$TMPDIR/sym_lowered.mlir"
+    SKIP_LOWER=1
+    ;;
+  alloc)
+    SRC="$SCRIPT_DIR/air_sym_with_alloc.mlir"
+    # Phase 4 alloc lowering, then Phase 3 rank lowering, then standard LLVM.
+    air-opt "$SRC" -air-symmetric-alloc-to-mgpu -air-rank-to-mgpu \
+      -o "$TMPDIR/post_phase4.mlir"
+    SRC="$TMPDIR/post_phase4.mlir"
+    PIPE='builtin.module(func.func(convert-scf-to-cf),convert-to-llvm,reconcile-unrealized-casts)'
+    ;;
+  dma)
+    SRC="$SCRIPT_DIR/air_sym_with_dma.mlir"
+    # Phase 5 cross-rank DMA, Phase 4 alloc, Phase 3 rank, then standard LLVM.
+    air-opt "$SRC" -air-cross-rank-dma-to-mgpu -air-symmetric-alloc-to-mgpu \
+      -air-rank-to-mgpu -o "$TMPDIR/post_phase5.mlir"
+    SRC="$TMPDIR/post_phase5.mlir"
+    PIPE='builtin.module(func.func(convert-scf-to-cf),convert-to-llvm,reconcile-unrealized-casts)'
+    ;;
+  channel)
+    SRC="$SCRIPT_DIR/air_sym_with_channel.mlir"
+    # Phase 6 channel, Phase 4 alloc, Phase 3 rank, then standard LLVM.
+    air-opt "$SRC" -air-gpu-channel-to-mgpu -air-symmetric-alloc-to-mgpu \
+      -air-rank-to-mgpu -o "$TMPDIR/post_phase6.mlir"
+    SRC="$TMPDIR/post_phase6.mlir"
+    PIPE='builtin.module(func.func(convert-scf-to-cf),convert-to-llvm,reconcile-unrealized-casts)'
+    ;;
+  prelowered)
+    # Pre-lowered MLIR file (e.g., output of `aircc --multi-gpu`).
+    # Path provided via SRC=path env var; bypass step 1.
+    if [ -z "${SRC:-}" ]; then
+      echo "INPUT=prelowered requires SRC=<path>" >&2
+      exit 1
+    fi
+    cp "$SRC" "$TMPDIR/sym_lowered.mlir"
+    SKIP_LOWER=1
+    ;;
+  *)
+    echo "Unknown INPUT=$INPUT; expected 'atomic', 'cacheline', 'rank', 'alloc', 'dma', 'channel', or 'prelowered'" >&2; exit 1;;
+esac
+
+if [ -z "${SKIP_LOWER:-}" ]; then
+  echo "Step 1c: Lower IR to LLVM dialect (INPUT=$INPUT)"
+  mlir-opt "$SRC" --pass-pipeline="$PIPE" -o "$TMPDIR/sym_lowered.mlir"
+fi
+
+echo "Step 2: Run as ${NUM_RANKS} processes"
+export AIRGPU_JOB_ID="${AIRGPU_JOB_ID:-$$}"
+
+PIDS=()
+PASS=1
+
+for i in $(seq 0 $((NUM_RANKS - 1))); do
+  (set -o pipefail
+   # Pin each process to its own GPU (GPU_IDS[i], which honours a caller-set
+   # HIP_VISIBLE_DEVICES) at the OS / HIP-visibility level. mlir-runner's
+   # built-in gpu.launch_func handler (and any nested call into
+   # libmlir_rocm_runtime.so) only ever sees one device, so it can't launch on
+   # the wrong one. Every rank sees device 0, so airgpu uses LOCAL_RANK=0.
+   RANK=$i WORLD_SIZE=$NUM_RANKS LOCAL_RANK=0 HIP_VISIBLE_DEVICES="${GPU_IDS[$i]}" \
+   mlir-runner --entry-point-result=void \
+     --shared-libs="$LLVM_LIB_DIR/libmlir_rocm_runtime.so" \
+     --shared-libs="$AIRGPU_LIB" \
+     --shared-libs="$LLVM_LIB_DIR/libmlir_runner_utils.so" \
+     --shared-libs="$LLVM_LIB_DIR/libmlir_c_runner_utils.so" \
+     "$TMPDIR/sym_lowered.mlir" 2>&1 | sed "s/^/[rank $i] /") &
+  PIDS+=($!)
+done
+
+for pid in "${PIDS[@]}"; do
+  if ! wait "$pid"; then
+    PASS=0
+  fi
+done
+
+if [ "$PASS" -eq 1 ]; then
+  echo "=== ALL ${NUM_RANKS} RANKS PASSED ==="
+else
+  echo "=== SOME RANKS FAILED ==="
+  exit 1
+fi
diff --git a/tools/aircc/aircc.cpp b/tools/aircc/aircc.cpp
index 8bb7fbad5..3401afb51 100644
--- a/tools/aircc/aircc.cpp
+++ b/tools/aircc/aircc.cpp
@@ -179,6 +179,16 @@ static cl::opt
     cl::desc("GPU runtime for ROCDL target (HIP or OpenCL)"),
     cl::init("HIP"), cl::cat(airCompilerOptions));
 
+static cl::opt<bool> multiGpu(
+    "multi-gpu",
+    cl::desc(
+        "When --target=gpu, lower air.rank / air.symmetric memref / cross-rank "
+        "air.dma_memcpy_nd / gpu_symmetric_heap air.channel ops to mgpu* "
+        "runtime calls. Produces host-only LLVM IR; the result must be run "
+        "as N processes (RANK / WORLD_SIZE / LOCAL_RANK env vars) linked "
+        "against libairgpu.so. See test/gpu/symmetric_heap_dma/run.sh."),
+    cl::init(false), cl::cat(airCompilerOptions));
+
 static cl::opt
     omitWhileTrueLoop("omit-while-true-loop",
                       cl::desc("Do not add while(true) loop around per-core "
@@ -707,6 +717,72 @@ static OwningOpRef cloneModule(ModuleOp moduleOp) {
 // GPU Compilation Pipeline
 //===----------------------------------------------------------------------===//
 
+// Multi-GPU host-only compilation pipeline.
Lowers the high-level multi-GPU
+// abstractions (air.rank, air.symmetric memref, cross-rank air.dma_memcpy_nd,
+// gpu_symmetric_heap air.channel) to mgpu* runtime calls + standard LLVM.
+// Output is host-only LLVM IR meant to be run as N processes via mlir-runner
+// with RANK / WORLD_SIZE / LOCAL_RANK env vars set. Fails if any step fails.
+static LogicalResult runMultiGpuCompilation() {
+  SmallString<256> baseName(sys::path::stem(inputFilename));
+
+  auto airOpt = sys::findProgramByName("air-opt");
+  auto mlirOpt = sys::findProgramByName("mlir-opt");
+  if (!airOpt) {
+    llvm::errs() << "Error: could not find air-opt in PATH\n";
+    return failure();
+  }
+  if (!mlirOpt) {
+    llvm::errs() << "Error: could not find mlir-opt in PATH\n";
+    return failure();
+  }
+
+  if (verbose) {
+    llvm::outs() << "Multi-GPU compilation for " << inputFilename << "\n";
+    llvm::outs() << " Tmpdir: " << tmpDir << "\n";
+  }
+
+  // Step 1: Lower multi-GPU abstractions to mgpu* runtime calls.
+  // Order: cross-rank-DMA / channel first (they reference air.symmetric
+  // allocs that survive Phase 4), then symmetric-alloc, then rank.
+  SmallString<256> step1(tmpDir);
+  sys::path::append(step1, baseName + "_mgpu.mlir");
+  if (failed(runCommand({*airOpt, inputFilename,
+                         "-air-cross-rank-dma-to-mgpu",
+                         "-air-gpu-channel-to-mgpu",
+                         "-air-symmetric-alloc-to-mgpu",
+                         "-air-rank-to-mgpu", "-o", step1.str().str()})))
+    return failure();
+
+  // Step 2: Standard LLVM lowering, into -o if given, else a tmpdir file.
+  std::string finalOutput = outputFilename;
+  if (finalOutput.empty()) {
+    SmallString<256> tmp(tmpDir);
+    sys::path::append(tmp, baseName + "_final.mlir");
+    finalOutput = tmp.str().str();
+  }
+  std::string llvmPipeline =
+      "--pass-pipeline=builtin.module(func.func(convert-scf-to-cf),"
+      "convert-to-llvm,reconcile-unrealized-casts)";
+  if (failed(runCommand(
+          {*mlirOpt, step1.str().str(), llvmPipeline, "-o", finalOutput})))
+    return failure();
+
+  if (verbose)
+    llvm::outs() << "Multi-GPU compilation complete! Output: " << finalOutput
+                 << "\nRun with: INPUT=prelowered SRC=" << finalOutput
+                 << " bash test/gpu/symmetric_heap_dma/run.sh\n";
+
+  if (outputFilename.empty()) { // No -o: echo the final IR to stdout.
+    auto bufOrErr = MemoryBuffer::getFile(finalOutput);
+    if (!bufOrErr) {
+      llvm::errs() << "Error: could not read " << finalOutput << "\n";
+      return failure();
+    }
+    llvm::outs() << (*bufOrErr)->getBuffer();
+  }
+  return success();
+}
+
 static LogicalResult runGpuCompilation() {
   SmallString<256> baseName(sys::path::stem(inputFilename));
 
@@ -1675,6 +1751,8 @@ int main(int argc, char **argv) {
 
   // Dispatch based on target
   if (target.getValue() == "gpu") {
+    if (multiGpu)
+      return failed(runMultiGpuCompilation()) ? 1 : 0;
     return failed(runGpuCompilation()) ? 1 : 0;
   } else {
     return failed(runAieCompilation()) ? 1 : 0;