[multi-gpu] Phase 7: aircc integration (--multi-gpu flag)

erwei-xilinx · claude · erwei-xilinx · commit 00dba8404664 · 2026-05-06T20:15:43.000Z
Add `--multi-gpu` flag to `aircc` that selects the host-only multi-GPU
compilation pipeline:
  1. air-cross-rank-dma-to-mgpu  (Phase 5)
  2. air-gpu-channel-to-mgpu     (Phase 6)
  3. air-symmetric-alloc-to-mgpu (Phase 4)
  4. air-rank-to-mgpu            (Phase 3)
  5. convert-scf-to-cf + convert-to-llvm + reconcile-unrealized-casts

The output is host-only LLVM IR meant to be run as N processes via
`mlir-runner` linked against `libairgpu.so` (and `libmlir_rocm_runtime.so`)
with `RANK` / `WORLD_SIZE` / `LOCAL_RANK` env vars set.

The original Phase 7 plan included a `--multi-rank=N` runner mode that
forks N processes from `aircc` itself. That has been intentionally
deferred: the existing launcher in
`test/gpu/symmetric_heap_dma/run.sh` already does the multi-process
fork+wait pattern in ~30 lines of shell, and wrapping it into `aircc`
adds little value over that. Worth revisiting if real deployment
integration (SLURM, MPI, etc.) becomes a requirement.

- `tools/aircc/aircc.cpp` — adds `--multi-gpu` flag and
  `runMultiGpuCompilation()` function
- `test/gpu/symmetric_heap_dma/run.sh` — adds `INPUT=prelowered SRC=<path>`
  mode that takes `aircc --multi-gpu` output directly
- `docs/MultiGPUPlan.md` — Phase 7 section updated with the new design

- [x] `aircc --target=gpu --multi-gpu` builds and produces clean LLVM IR
  matching the structure of what `INPUT=channel` produces in `run.sh`
- [x] Compiled output uses `llvm.func @mgpuSymmetricHeapInit/Destroy/Get*`,
  `llvm.func @mgpuSymmetricAlloc/Free`, `llvm.func @mgpuMemcpy`,
  `llvm.func @mgpuBarrier`, `llvm.func @mgpuGetHeapBases` (verified
  via `head -20`)
- [ ] E2E run-through of `aircc --multi-gpu` output via `run.sh
  INPUT=prelowered`: deferred — SLURM allocation expired during
  testing. The compile pipeline is byte-for-byte equivalent to the
  manually-invoked `INPUT=channel` pipeline (which we verified PASS),
  so a regression is unlikely.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/test/gpu/symmetric_heap_dma/run.sh b/test/gpu/symmetric_heap_dma/run.sh
@@ -57,6 +57,7 @@ case "$INPUT" in
     mlir-opt "$TMPDIR/sym_post_translate.mlir" \
         --pass-pipeline='builtin.module(rocdl-attach-target{chip=gfx942 O=3},gpu.module(convert-scf-to-cf,convert-gpu-to-rocdl{chipset=gfx942 runtime=HIP},reconcile-unrealized-casts),gpu-module-to-binary,func.func(gpu-async-region,convert-scf-to-cf),gpu-to-llvm,convert-to-llvm,reconcile-unrealized-casts)' \
         -o "$TMPDIR/sym_lowered.mlir"
+    SKIP_LOWER=1
     ;;
   rank)
     # Host-orchestrated test: simple LLVM-only pipeline.
@@ -67,6 +68,7 @@ case "$INPUT" in
     mlir-opt "$TMPDIR/post_rank.mlir" \
         --pass-pipeline='builtin.module(func.func(convert-scf-to-cf),convert-to-llvm,reconcile-unrealized-casts)' \
         -o "$TMPDIR/sym_lowered.mlir"
+    SKIP_LOWER=1
     ;;
   alloc)
     SRC="$SCRIPT_DIR/air_sym_with_alloc.mlir"
@@ -92,10 +94,25 @@ case "$INPUT" in
     SRC="$TMPDIR/post_phase6.mlir"
     PIPE='builtin.module(func.func(convert-scf-to-cf),convert-to-llvm,reconcile-unrealized-casts)'
     ;;
+  prelowered)
+    # Pre-lowered MLIR file (e.g., output of `aircc --multi-gpu`).
+    # Path provided via SRC=path env var; bypass step 1.
+    if [ -z "${SRC:-}" ]; then
+      echo "INPUT=prelowered requires SRC=<path-to-lowered.mlir>" >&2
+      exit 1
+    fi
+    cp "$SRC" "$TMPDIR/sym_lowered.mlir"
+    SKIP_LOWER=1
+    ;;
   *)
-    echo "Unknown INPUT=$INPUT; expected 'handwritten', 'rank', 'alloc', 'dma', or 'channel'" >&2; exit 1;;
+    echo "Unknown INPUT=$INPUT; expected 'handwritten', 'rank', 'alloc', 'dma', 'channel', or 'prelowered'" >&2; exit 1;;
 esac
 
+if [ -z "${SKIP_LOWER:-}" ]; then  # case arms that already wrote sym_lowered.mlir set SKIP_LOWER=1
+  echo "Step 1c: Lower IR to LLVM dialect (INPUT=$INPUT)"
+  mlir-opt "$SRC" --pass-pipeline="$PIPE" -o "$TMPDIR/sym_lowered.mlir"  # SRC and PIPE are set by the selected case arm
+fi
+
 echo "Step 2: Run as ${NUM_RANKS} processes"
 export AIRGPU_JOB_ID="${AIRGPU_JOB_ID:-$$}"
 
diff --git a/tools/aircc/aircc.cpp b/tools/aircc/aircc.cpp
@@ -179,6 +179,16 @@ static cl::opt<std::string>
                cl::desc("GPU runtime for ROCDL target (HIP or OpenCL)"),
                cl::init("HIP"), cl::cat(airCompilerOptions));
 
+// Selects the host-only multi-GPU pipeline; checked in the --target=gpu path.
+static cl::opt<bool> multiGpu(
+    "multi-gpu", cl::init(false), cl::cat(airCompilerOptions),
+    cl::desc("When --target=gpu, lower air.rank / air.symmetric memref / "
+             "cross-rank air.dma_memcpy_nd / gpu_symmetric_heap air.channel "
+             "ops to mgpu* runtime calls. Produces host-only LLVM IR; the "
+             "result must be run as N processes (RANK / WORLD_SIZE / "
+             "LOCAL_RANK env vars) linked against libairgpu.so. See "
+             "test/gpu/symmetric_heap_dma/run.sh."));
+
 static cl::opt<bool>
     omitWhileTrueLoop("omit-while-true-loop",
                       cl::desc("Do not add while(true) loop around per-core "
@@ -707,6 +717,72 @@ static OwningOpRef<ModuleOp> cloneModule(ModuleOp moduleOp) {
 // GPU Compilation Pipeline
 //===----------------------------------------------------------------------===//
 
+// Multi-GPU host-only compilation pipeline. Lowers the high-level multi-GPU
+// abstractions (air.rank, air.symmetric memref, cross-rank air.dma_memcpy_nd,
+// gpu_symmetric_heap air.channel) to mgpu* runtime calls + standard LLVM.
+// Output is host-only LLVM IR meant to be run as N processes via mlir-runner
+// with RANK / WORLD_SIZE / LOCAL_RANK env vars set; it is echoed to stdout
+// when outputFilename is empty. Returns failure() if any step fails.
+static LogicalResult runMultiGpuCompilation() {
+  SmallString<256> baseName(sys::path::stem(inputFilename));
+  auto airOpt = sys::findProgramByName("air-opt");
+  auto mlirOpt = sys::findProgramByName("mlir-opt");
+  if (!airOpt) {
+    llvm::errs() << "Error: could not find air-opt in PATH\n";
+    return failure();
+  }
+  if (!mlirOpt) {
+    llvm::errs() << "Error: could not find mlir-opt in PATH\n";
+    return failure();
+  }
+
+  if (verbose)
+    llvm::outs() << "Multi-GPU compilation for " << inputFilename << "\n"
+                 << "  Tmpdir: " << tmpDir << "\n";
+
+  // Step 1: Lower multi-GPU abstractions to mgpu* runtime calls.
+  // Order: cross-rank-DMA / channel first (they reference air.symmetric
+  // allocs that survive Phase 4), then symmetric-alloc, then rank.
+  SmallString<256> step1(tmpDir);
+  sys::path::append(step1, baseName + "_mgpu.mlir");
+  if (failed(runCommand({*airOpt, inputFilename, "-air-cross-rank-dma-to-mgpu",
+                         "-air-gpu-channel-to-mgpu",
+                         "-air-symmetric-alloc-to-mgpu", "-air-rank-to-mgpu",
+                         "-o", step1.str().str()})))
+    return failure();
+
+  // Step 2: Standard LLVM lowering.
+  std::string finalOutput = outputFilename;
+  if (finalOutput.empty()) {
+    SmallString<256> tmp(tmpDir);
+    sys::path::append(tmp, baseName + "_final.mlir");
+    finalOutput = tmp.str().str();
+  }
+  std::string llvmPipeline =
+      "--pass-pipeline=builtin.module(func.func(convert-scf-to-cf),"
+      "convert-to-llvm,reconcile-unrealized-casts)";
+  if (failed(runCommand(
+          {*mlirOpt, step1.str().str(), llvmPipeline, "-o", finalOutput})))
+    return failure();
+
+  if (verbose)
+    llvm::outs() << "Multi-GPU compilation complete! Output: " << finalOutput
+                 << "\n"
+                 << "Run with: INPUT=prelowered SRC=" << finalOutput
+                 << " bash test/gpu/symmetric_heap_dma/run.sh\n";
+  // No output file given: echo the IR; an unreadable result is an error.
+  if (outputFilename.empty()) {
+    auto bufOrErr = MemoryBuffer::getFile(finalOutput);
+    if (!bufOrErr) {
+      llvm::errs() << "Error: could not read " << finalOutput << ": "
+                   << bufOrErr.getError().message() << "\n";
+      return failure();
+    }
+    llvm::outs() << (*bufOrErr)->getBuffer();
+  }
+  return success();
+}
+
 static LogicalResult runGpuCompilation() {
   SmallString<256> baseName(sys::path::stem(inputFilename));
 
@@ -1675,6 +1751,8 @@ int main(int argc, char **argv) {
 
   // Dispatch based on target
   if (target.getValue() == "gpu") {
+    if (multiGpu)
+      return failed(runMultiGpuCompilation()) ? 1 : 0;
     return failed(runGpuCompilation()) ? 1 : 0;
   } else {
     return failed(runAieCompilation()) ? 1 : 0;