Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
0c3d9e7
[multi-gpu] Phase 2: hand-written e2e test for symmetric-heap multi-GPU
erwei-xilinx May 3, 2026
6603622
[multi-gpu] Phase 2: remove SHARE_GPU; fail-fast precondition
erwei-xilinx May 5, 2026
186cbf1
[multi-gpu] Phase 2: air.translate op + air-translate-to-llvm lowering
erwei-xilinx May 5, 2026
3ae4f07
[multi-gpu] Phase 2: kernel-driven producer/consumer rewrite
erwei-xilinx May 6, 2026
e24f5b0
[multi-gpu] Phase 2: fix CI failures (REQUIRES:gpu + clang-format-17)
erwei-xilinx May 6, 2026
fb1061d
[multi-gpu] Phase 2: air.translate uses memref<?xindex>, not !llvm.ptr
erwei-xilinx May 6, 2026
1a41079
[multi-gpu] Phase 2: drop dead args/locals in handwritten test
erwei-xilinx May 6, 2026
3208aed
[multi-gpu] Phase 2: full-loop verify, fail-loud exit, syncscope test
erwei-xilinx May 6, 2026
281b407
[multi-gpu] Phase 2: collapse 3 wrap_* helpers into one wrap_bytes
erwei-xilinx May 6, 2026
977767d
[multi-gpu] Phase 2: factor flag_slot_ptr helper; document memref ato…
erwei-xilinx May 6, 2026
4fba2bc
[multi-gpu] Phase 2: spell System syncscope explicitly on atomics
erwei-xilinx May 12, 2026
8ad56da
[multi-gpu] Phase 2: drop redundant gpu.barrier in consumer kernel
erwei-xilinx May 12, 2026
13dbb7d
[multi-gpu] Phase 2: split handwritten test into atomic + cacheline v…
erwei-xilinx May 12, 2026
f6984de
[multi-gpu] Phase 2: drop world_size=1 graceful skip; require >= 2 ranks
erwei-xilinx May 12, 2026
866a74c
[multi-gpu] Phase 3: air-rank-to-mgpu lowering pass
erwei-xilinx May 3, 2026
689ca78
[multi-gpu] Phase 4: air-symmetric-alloc-to-mgpu lowering pass
erwei-xilinx May 3, 2026
4a22a25
[multi-gpu] Phase 5: air-cross-rank-dma-to-mgpu lowering pass
erwei-xilinx May 3, 2026
d09e741
[multi-gpu] Phase 6: air-gpu-channel-to-mgpu lowering pass
erwei-xilinx May 3, 2026
e9a1fc6
[multi-gpu] Phase 7: aircc integration (--multi-gpu flag)
erwei-xilinx May 3, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 22 additions & 0 deletions mlir/include/air/Conversion/AIRCrossRankDmaToMgpuPass.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
//===- AIRCrossRankDmaToMgpuPass.h ------------------------------*- C++ -*-===//
//
// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
// SPDX-License-Identifier: MIT
//
//===-----------------------------------------------------------------------===//

#ifndef AIR_CONVERSION_AIR_CROSS_RANK_DMA_TO_MGPU_PASS_H
#define AIR_CONVERSION_AIR_CROSS_RANK_DMA_TO_MGPU_PASS_H

#include "mlir/Pass/Pass.h"
#include <memory>

namespace xilinx {
namespace air {

/// Factory for the `air-cross-rank-dma-to-mgpu` pass; this is the function
/// named as that pass's `constructor` in GPUPasses.td. Per the pass summary
/// there, the pass lowers `air.dma_memcpy_nd` ops carrying `src_rank` /
/// `dst_rank` to `mgpuMemcpy` with peer-VA addressing through
/// `mgpuGetHeapBases()`.
///
/// \returns the pass as a `std::unique_ptr<mlir::Pass>`; ownership passes to
/// the caller.
std::unique_ptr<mlir::Pass> createAIRCrossRankDmaToMgpuPass();

} // namespace air
} // namespace xilinx

#endif // AIR_CONVERSION_AIR_CROSS_RANK_DMA_TO_MGPU_PASS_H
22 changes: 22 additions & 0 deletions mlir/include/air/Conversion/AIRGpuChannelToMgpuPass.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
//===- AIRGpuChannelToMgpuPass.h --------------------------------*- C++ -*-===//
//
// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
// SPDX-License-Identifier: MIT
//
//===-----------------------------------------------------------------------===//

#ifndef AIR_CONVERSION_AIR_GPU_CHANNEL_TO_MGPU_PASS_H
#define AIR_CONVERSION_AIR_GPU_CHANNEL_TO_MGPU_PASS_H

#include "mlir/Pass/Pass.h"
#include <memory>

namespace xilinx {
namespace air {

/// Factory for the `air-gpu-channel-to-mgpu` pass; this is the function named
/// as that pass's `constructor` in GPUPasses.td. Per the pass summary there,
/// the pass lowers `air.channel.put` / `air.channel.get` of
/// `channel_type = "gpu_symmetric_heap"` to host-side `mgpuMemcpy` (peer-VA)
/// plus `mgpuBarrier`.
///
/// \returns the pass as a `std::unique_ptr<mlir::Pass>`; ownership passes to
/// the caller.
std::unique_ptr<mlir::Pass> createAIRGpuChannelToMgpuPass();

} // namespace air
} // namespace xilinx

#endif // AIR_CONVERSION_AIR_GPU_CHANNEL_TO_MGPU_PASS_H
22 changes: 22 additions & 0 deletions mlir/include/air/Conversion/AIRRankToMgpuPass.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
//===- AIRRankToMgpuPass.h ---------------------------------------*- C++ -*-===//
//
// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
// SPDX-License-Identifier: MIT
//
//===-----------------------------------------------------------------------===//

#ifndef AIR_CONVERSION_AIR_RANK_TO_MGPU_PASS_H
#define AIR_CONVERSION_AIR_RANK_TO_MGPU_PASS_H

#include "mlir/Pass/Pass.h"
#include <memory>

namespace xilinx {
namespace air {

/// Factory for the `air-rank-to-mgpu` pass; this is the function named as that
/// pass's `constructor` in GPUPasses.td. Per the pass summary there, the pass
/// lowers `air.rank` to `mgpu*` runtime calls (multi-GPU process model).
///
/// The factory takes no arguments; the `heap-size` option declared for the
/// pass in GPUPasses.td is not a parameter of this function.
///
/// \returns the pass as a `std::unique_ptr<mlir::Pass>`; ownership passes to
/// the caller.
std::unique_ptr<mlir::Pass> createAIRRankToMgpuPass();

} // namespace air
} // namespace xilinx

#endif // AIR_CONVERSION_AIR_RANK_TO_MGPU_PASS_H
22 changes: 22 additions & 0 deletions mlir/include/air/Conversion/AIRSymmetricAllocToMgpuPass.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
//===- AIRSymmetricAllocToMgpuPass.h ----------------------------*- C++ -*-===//
//
// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
// SPDX-License-Identifier: MIT
//
//===-----------------------------------------------------------------------===//

#ifndef AIR_CONVERSION_AIR_SYMMETRIC_ALLOC_TO_MGPU_PASS_H
#define AIR_CONVERSION_AIR_SYMMETRIC_ALLOC_TO_MGPU_PASS_H

#include "mlir/Pass/Pass.h"
#include <memory>

namespace xilinx {
namespace air {

/// Factory for the `air-symmetric-alloc-to-mgpu` pass; this is the function
/// named as that pass's `constructor` in GPUPasses.td. Per the pass summary
/// there, the pass lowers `memref.alloc {air.symmetric}` to
/// `mgpuSymmetricAlloc` and `memref.dealloc` of the result to
/// `mgpuSymmetricFree`.
///
/// \returns the pass as a `std::unique_ptr<mlir::Pass>`; ownership passes to
/// the caller.
std::unique_ptr<mlir::Pass> createAIRSymmetricAllocToMgpuPass();

} // namespace air
} // namespace xilinx

#endif // AIR_CONVERSION_AIR_SYMMETRIC_ALLOC_TO_MGPU_PASS_H
22 changes: 22 additions & 0 deletions mlir/include/air/Conversion/AIRTranslateToLLVMPass.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
//===- AIRTranslateToLLVMPass.h --------------------------------*- C++ -*-===//
//
// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
// SPDX-License-Identifier: MIT
//
//===-----------------------------------------------------------------------===//

#ifndef AIR_CONVERSION_AIR_TRANSLATE_TO_LLVM_PASS_H
#define AIR_CONVERSION_AIR_TRANSLATE_TO_LLVM_PASS_H

#include "mlir/Pass/Pass.h"
#include <memory>

namespace xilinx {
namespace air {

/// Factory for the `air-translate-to-llvm` pass; this is the function named as
/// that pass's `constructor` in GPUPasses.td. Per the pass summary there, the
/// pass lowers `air.translate` to `memref.reinterpret_cast` plus LLVM-dialect
/// address arithmetic.
///
/// \returns the pass as a `std::unique_ptr<mlir::Pass>`; ownership passes to
/// the caller.
std::unique_ptr<mlir::Pass> createAIRTranslateToLLVMPass();

} // namespace air
} // namespace xilinx

#endif // AIR_CONVERSION_AIR_TRANSLATE_TO_LLVM_PASS_H
5 changes: 5 additions & 0 deletions mlir/include/air/Conversion/GPUPassDetail.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,13 @@ namespace air {
using namespace mlir;

#define GEN_PASS_DECL
#define GEN_PASS_DEF_AIRTRANSLATETOLLVM
#define GEN_PASS_DEF_CONVERTAIRTOROCDL
#define GEN_PASS_DEF_CONVERTGPUKERNELOUTLINE
#define GEN_PASS_DEF_AIRRANKTOMGPU
#define GEN_PASS_DEF_AIRSYMMETRICALLOCTOMGPU
#define GEN_PASS_DEF_AIRCROSSRANKDMATOMGPU
#define GEN_PASS_DEF_AIRGPUCHANNELTOMGPU
#include "air/Conversion/GPUPasses.h.inc"

} // namespace air
Expand Down
120 changes: 120 additions & 0 deletions mlir/include/air/Conversion/GPUPasses.td
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,23 @@ def ConvertAIRToROCDL : Pass<"air-to-rocdl", "ModuleOp"> {
let options = [];
}

// Pass `air-translate-to-llvm`, anchored on `ModuleOp`: expands every
// `air.translate` op into the offset computation spelled out in the
// description below.
def AIRTranslateToLLVM : Pass<"air-translate-to-llvm", "ModuleOp"> {
let summary = "Lower air.translate to memref.reinterpret_cast + LLVM-dialect address arithmetic";
let description = [{
Expands each `air.translate` op into the pointer-rebase computation:
`bases[to_rank] - bases[from_rank]`, converted from bytes to elements
of the source memref's element type, then applied as a new offset
via `memref.reinterpret_cast`. The expansion is pure arithmetic; it
works identically on host functions and inside `gpu.func`.
}];
// C++ factory expression used to instantiate the pass; the function is
// declared in air/Conversion/AIRTranslateToLLVMPass.h.
let constructor = "xilinx::air::createAIRTranslateToLLVMPass()";
// Dialects declared as dependencies of this pass, so that they are loaded
// before the pass runs and it can create ops from them.
let dependentDialects = [
"mlir::arith::ArithDialect",
"mlir::memref::MemRefDialect",
"mlir::LLVM::LLVMDialect"
];
}

def ConvertGPUKernelOutline : Pass<"air-gpu-outlining", "ModuleOp"> {
let summary = "Outline GPU Kernel Func from GPU Launch";
let constructor = "xilinx::air::createGPUKernelOutlinePass()";
Expand All @@ -32,4 +49,107 @@ def ConvertGPUKernelOutline : Pass<"air-gpu-outlining", "ModuleOp"> {
let options = [];
}

// Pass `air-gpu-channel-to-mgpu`, anchored on `ModuleOp`: host-side lowering
// of `gpu_symmetric_heap` channels to `mgpuBarrier` / `mgpuMemcpy` runtime
// calls, subject to the restrictions listed in the description.
def AIRGpuChannelToMgpu : Pass<"air-gpu-channel-to-mgpu", "ModuleOp"> {
let summary = "Lower air.channel.put/get of channel_type=\"gpu_symmetric_heap\" "
"to host-side mgpuMemcpy (peer-VA) + mgpuBarrier";
// C++ factory expression used to instantiate the pass; the function is
// declared in air/Conversion/AIRGpuChannelToMgpuPass.h.
let constructor = "xilinx::air::createAIRGpuChannelToMgpuPass()";
let description = [{
For each `air.channel @C [...] {channel_type = "gpu_symmetric_heap"}`,
pair its single `air.channel.put` and single `air.channel.get`. The put
becomes `mgpuBarrier()` (publish: data is already in the symmetric heap
via the put's `air.symmetric` source memref). The get becomes
`mgpuBarrier()` followed by `mgpuMemcpy(dst, peer_va(put_src), size)`
where the peer rank is the get's first index operand and the peer VA is
computed via `mgpuGetHeapBases()`.

Restrictions in this initial version:
- One put and one get per channel symbol.
- Both put and get at host scope (no `gpu.launch`/`gpu.func`).
- put's source memref must be `air.symmetric`-tagged.
- get's destination memref must be in `memory_space=0`.
- "Entire memref" form only on both sides.
- get must take exactly one index operand (the peer rank).
}];
// Dialects declared as dependencies of this pass, so that they are loaded
// before the pass runs and it can create ops from them.
let dependentDialects = [
"func::FuncDialect", "arith::ArithDialect", "memref::MemRefDialect",
"LLVM::LLVMDialect"
];
}

// Pass `air-cross-rank-dma-to-mgpu`, anchored on `ModuleOp`: host-side
// lowering of rank-annotated `air.dma_memcpy_nd` ops to `mgpuMemcpy` runtime
// calls, subject to the restrictions listed in the description.
def AIRCrossRankDmaToMgpu : Pass<"air-cross-rank-dma-to-mgpu", "ModuleOp"> {
let summary = "Lower air.dma_memcpy_nd with src_rank/dst_rank to mgpuMemcpy "
"with peer-VA addressing through mgpuGetHeapBases()";
// C++ factory expression used to instantiate the pass; the function is
// declared in air/Conversion/AIRCrossRankDmaToMgpuPass.h.
let constructor = "xilinx::air::createAIRCrossRankDmaToMgpuPass()";
let description = [{
For each `air.dma_memcpy_nd` op carrying a `src_rank` or `dst_rank`
integer attribute, emit a host-side `mgpuMemcpy` whose peer-side pointer
is computed as `mgpuGetHeapBases()[peer] + (local_ptr - local_base)`.

Restrictions in this initial version:
- Both `src` and `dst` memrefs must be in `memory_space=0`.
- The op must be at host scope (not inside any `gpu.launch`/`gpu.func`).
- "Entire memref" form only: `[]` `[]` `[]` for both sides — no
custom offsets / sizes / strides.

Run this pass *before* `air-symmetric-alloc-to-mgpu` so that pointer
extraction (`memref.extract_aligned_pointer_as_index`) sees plain
memrefs rather than already-cast LLVM struct values.
}];
// Dialects declared as dependencies of this pass, so that they are loaded
// before the pass runs and it can create ops from them.
let dependentDialects = [
"func::FuncDialect", "arith::ArithDialect", "memref::MemRefDialect",
"LLVM::LLVMDialect"
];
}

// Pass `air-symmetric-alloc-to-mgpu`, anchored on `ModuleOp`: replaces
// `air.symmetric`-tagged `memref.alloc` ops and their matching
// `memref.dealloc` ops with symmetric-heap runtime calls, as detailed in the
// description.
def AIRSymmetricAllocToMgpu : Pass<"air-symmetric-alloc-to-mgpu", "ModuleOp"> {
let summary = "Lower memref.alloc {air.symmetric} to mgpuSymmetricAlloc and "
"memref.dealloc of the result to mgpuSymmetricFree";
// C++ factory expression used to instantiate the pass; the function is
// declared in air/Conversion/AIRSymmetricAllocToMgpuPass.h.
let constructor = "xilinx::air::createAIRSymmetricAllocToMgpuPass()";
let description = [{
Replaces each `memref.alloc` carrying the unit attribute `air.symmetric`
with a call to `mgpuSymmetricAlloc(size_in_bytes, stream)` returning
`!llvm.ptr`, then builds an LLVM memref descriptor (struct) wrapping that
pointer and projects it back to the original memref type via
`builtin.unrealized_conversion_cast` so downstream uses keep working.

For every `memref.dealloc` whose operand traces back (through a single
`unrealized_conversion_cast`) to such a symmetric alloc, the pass emits
`mgpuSymmetricFree(ptr, stream)` and erases the dealloc.

Should run before `convert-to-llvm`. Does nothing if no `air.symmetric`
allocations are present.
}];
// Dialects declared as dependencies of this pass, so that they are loaded
// before the pass runs and it can create ops from them.
let dependentDialects = [
"func::FuncDialect", "arith::ArithDialect", "LLVM::LLVMDialect"
];
}

// Pass `air-rank-to-mgpu`, anchored on `ModuleOp`: inlines each `air.rank`
// body for the one-process-per-rank execution model and brackets the
// enclosing function with symmetric-heap init/destroy calls, as detailed in
// the description.
def AIRRankToMgpu : Pass<"air-rank-to-mgpu", "ModuleOp"> {
let summary = "Lower air.rank to mgpu* runtime calls (multi-GPU process model)";
// C++ factory expression used to instantiate the pass; the function is
// declared in air/Conversion/AIRRankToMgpuPass.h.
let constructor = "xilinx::air::createAIRRankToMgpuPass()";
let description = [{
Each `air.rank` op is replaced by inlining its body in place, with rank
IDs computed from `mgpuGetRank()` (delinearized into the rank's N-D
iteration space) and rank sizes substituted from the static size operands.

The pass also inserts `mgpuSymmetricHeapInit(heap_size)` at the entry of
the enclosing `func.func` (default 256 MiB; configurable via the
`heap-size` option) and `mgpuSymmetricHeapDestroy()` before each
`func.return` in that function.

This replaces `air-rank-to-launch` for the GPU pipeline. Unlike
`air-rank-to-launch` (which serializes ranks via `scf.for`), this pass
assumes that each process executes the whole rank body once and that the
runtime coordinates across processes via env vars (RANK / WORLD_SIZE /
LOCAL_RANK) and the symmetric-heap fabric.
}];
// `heap-size` is given in bytes; the default 268435456 equals
// 256 * 1024 * 1024, i.e. 256 MiB.
let options = [
Option<"heapSize", "heap-size", "uint64_t", "/*default=*/268435456",
"Symmetric heap size in bytes (default: 256 MiB)">
];
// Dialects declared as dependencies of this pass, so that they are loaded
// before the pass runs and it can create ops from them.
let dependentDialects = [
"func::FuncDialect", "arith::ArithDialect"
];
}

#endif // AIR_CONVERSION_GPU_PASSES
37 changes: 37 additions & 0 deletions mlir/include/air/Dialect/AIR/AIR.td
Original file line number Diff line number Diff line change
Expand Up @@ -926,6 +926,43 @@ def air_ExecuteTerminatorOp : air_Op<"execute_terminator", [HasParent<"ExecuteOp
[{ attr-dict ($results^ `:` type($results))? }];
}

// `air.translate`: value-level re-expression of a symmetric-heap memref in
// another rank's address space. The result has exactly the type of `$source`
// (enforced by AllTypesMatch), which is why the assembly format below only
// spells out the types of `$source` and `$heap_bases`.
//
// NOTE(review): the op is marked `Pure`, yet the expansion documented for
// `air-translate-to-llvm` reads `bases[to_rank]` / `bases[from_rank]` from
// `$heap_bases`. Presumably this relies on the heap-base table not changing
// once it has been obtained from the runtime — confirm.
def air_TranslateOp : air_Op<"translate",
[Pure, AllTypesMatch<["source", "result"]>]>,
// Operands: the memref to rebase, the rank it is assumed to live on, the
// target rank, and a rank-1 memref of `index` holding per-rank heap bases.
Arguments<(ins AnyMemRef:$source,
Index:$from_rank,
Index:$to_rank,
MemRefRankOf<[Index], [1]>:$heap_bases)>,
Results<(outs AnyMemRef:$result)> {
let summary = "Re-express a symmetric-heap memref in another rank's address space";
let description = [{
Produces a memref of the same type as `$source` whose underlying
pointer references the corresponding allocation on `$to_rank`. The
`$source` memref is assumed to live on `$from_rank`'s symmetric heap.
The translation is the pointer rebase

peer_va = bases[to_rank] + (source_ptr - bases[from_rank])

where `$heap_bases` is a 1-D memref of `index`-typed pointer values
(per-rank symmetric-heap base addresses) obtained from the
`mgpuGetHeapBases()` runtime hook. The host typically wraps the raw
runtime pointer as a `memref<?xindex>` once and threads it through
`gpu.launch_func` as a kernel argument. No data is moved; this op
produces a value-level "view" of peer memory.

Folds to `$source` when `$from_rank` and `$to_rank` are statically
equal.

Both ranks must address the same collective allocation on the
symmetric heap (i.e. `$source` must trace back to a
`memref.alloc {air.symmetric}`). Using this op outside that contract
is undefined.
}];
// Textual form: the four operands, then attr-dict, then the source and
// heap-bases types.
let assemblyFormat =
[{ $source `,` $from_rank `,` $to_rank `,` $heap_bases
attr-dict `:` type($source) `,` type($heap_bases) }];
// Requests a C++ `fold` hook; the description above documents the fold to
// `$source` when the two ranks are statically equal.
let hasFolder = 1;
}

// AIR custom op, as a handle for a user-provided AIE kernel

def air_CustomOp : air_Op<"custom", [air_AsyncOpInterface,
Expand Down
Loading
Loading