|
| 1 | +//===- air_sym_handwritten.mlir - hand-written multi-GPU e2e test --------===// |
| 2 | +// |
| 3 | +// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved. |
| 4 | +// SPDX-License-Identifier: MIT |
| 5 | +// |
| 6 | +//===------------------------------------------------------------------===// |
| 7 | +// |
| 8 | +// Hand-written reference IR exercising the symmetric-heap multi-GPU runtime |
| 9 | +// from MLIR. This is what the (future) air-rank-to-mgpu + cross-rank-DMA |
| 10 | +// lowering passes should produce. |
| 11 | +// |
| 12 | +// Each process executes this main once. With WORLD_SIZE=2: |
| 13 | +// 1. Init symmetric heap. |
| 14 | +// 2. Allocate a 1024xf32 symmetric buffer. |
| 15 | +// 3. Each rank fills its buffer with (rank + 1).0 from host. |
| 16 | +// 4. Barrier. |
| 17 | +// 5. Each rank reads peer's buffer via mgpuGetHeapBases()[peer]+offset, |
| 18 | +// copies it D2D into a local hipMalloc-style buffer, then D2H into a |
| 19 | +// host buffer, and verifies every element == (peer + 1).0. |
| 20 | +// 6. Print PASS / FAIL. |
| 21 | +// |
| 22 | +// Launcher: run.sh forks N processes with RANK / WORLD_SIZE / LOCAL_RANK. |
| 23 | +// |
| 24 | +//===------------------------------------------------------------------===// |
| 25 | + |
| 26 | +module { |
| 27 | +  // ---- mgpu* C ABI declarations ----------------------------------------- |
| 28 | +  // External runtime entry points; their definitions are not in this file. |
| 29 | +  func.func private @mgpuSymmetricHeapInit(i64) |
| 30 | +  func.func private @mgpuSymmetricHeapDestroy() |
| 31 | +  func.func private @mgpuGetRank() -> i32 |
| 32 | +  func.func private @mgpuGetWorldSize() -> i32 |
| 33 | +  func.func private @mgpuSymmetricAlloc(i64, !llvm.ptr) -> !llvm.ptr |
| 34 | +  func.func private @mgpuSymmetricFree(!llvm.ptr, !llvm.ptr) |
| 35 | +  // Declared but not called by @main, which uses the array form below. |
| 36 | +  func.func private @mgpuGetHeapBase(i32) -> !llvm.ptr |
| 37 | +  func.func private @mgpuGetHeapBases() -> !llvm.ptr |
| 38 | +  func.func private @mgpuBarrier() |
| 39 | +  // NOTE(review): upstream MLIR's GPU-to-LLVM lowering declares the trailing |
| 40 | +  // bool argument of mgpuMemAlloc as i8 -- confirm i1 matches this runtime. |
| 41 | +  func.func private @mgpuMemAlloc(i64, !llvm.ptr, i1) -> !llvm.ptr |
| 42 | +  func.func private @mgpuMemFree(!llvm.ptr, !llvm.ptr) |
| 43 | +  func.func private @mgpuMemcpy(!llvm.ptr, !llvm.ptr, i64, !llvm.ptr) |
| 44 | + |
| 45 | +  // libc helpers |
| 46 | +  func.func private @malloc(i64) -> !llvm.ptr |
| 47 | +  func.func private @free(!llvm.ptr) |
| 48 | +  llvm.func @printf(!llvm.ptr, ...) -> i32 |
| 49 | + |
| 50 | +  // printf format strings (NUL-terminated). |
| 51 | +  llvm.mlir.global internal constant @msg_init("[mlir] rank %d / world %d, init OK\0A\00") {addr_space = 0 : i32} |
| 52 | +  llvm.mlir.global internal constant @msg_pass("[mlir] rank %d: cross-rank read PASS (peer=%d, expected=%.1f)\0A\00") {addr_space = 0 : i32} |
| 53 | +  llvm.mlir.global internal constant @msg_fail("[mlir] rank %d: MISMATCH at idx=%ld got=%.1f expected=%.1f\0A\00") {addr_space = 0 : i32} |
| 54 | +  llvm.mlir.global internal constant @msg_only1("[mlir] rank %d: world_size=1, skipping cross-rank read\0A\00") {addr_space = 0 : i32} |
| 55 | +  llvm.mlir.global internal constant @msg_done("[mlir] rank %d: ALL PASSED\0A\00") {addr_space = 0 : i32} |
| 56 | +  llvm.mlir.global internal constant @msg_failed("[mlir] rank %d: FAILED (%d mismatched elements)\0A\00") {addr_space = 0 : i32} |
| 57 | + |
| 58 | +  // ---- main ------------------------------------------------------------- |
| 59 | +  // Per-rank test driver. Fills a symmetric buffer with (rank + 1).0 and, |
| 60 | +  // when world_size > 1, copies the buffer of peer (rank + 1) % world back |
| 61 | +  // to the host and compares every element against (peer + 1).0. |
| 62 | +  // "ALL PASSED" is printed only when no element mismatched; otherwise a |
| 63 | +  // FAILED summary carrying the mismatch count is printed instead. |
| 64 | +  func.func @main() { |
| 65 | +    // Constants |
| 66 | +    %c0_i32 = arith.constant 0 : i32 |
| 67 | +    %c1_i32 = arith.constant 1 : i32 |
| 68 | +    %max_reports = arith.constant 8 : i32 // cap on per-element mismatch prints |
| 69 | +    %c4096_i64 = arith.constant 4096 : i64 // N * sizeof(f32) |
| 70 | +    %heap_size = arith.constant 268435456 : i64 // 256 MB |
| 71 | +    %nullptr = llvm.mlir.zero : !llvm.ptr |
| 72 | +    %false = arith.constant false |
| 73 | +    %c0 = arith.constant 0 : index |
| 74 | +    %c1 = arith.constant 1 : index |
| 75 | +    %c1024 = arith.constant 1024 : index // N |
| 76 | + |
| 77 | +    // Init symmetric heap (collective) |
| 78 | +    func.call @mgpuSymmetricHeapInit(%heap_size) : (i64) -> () |
| 79 | +    %rank = func.call @mgpuGetRank() : () -> i32 |
| 80 | +    %world = func.call @mgpuGetWorldSize() : () -> i32 |
| 81 | + |
| 82 | +    // printf("[mlir] rank %d / world %d, init OK\n", rank, world) |
| 83 | +    %fmt_init = llvm.mlir.addressof @msg_init : !llvm.ptr |
| 84 | +    llvm.call @printf(%fmt_init, %rank, %world) vararg(!llvm.func<i32 (ptr, ...)>) : (!llvm.ptr, i32, i32) -> i32 |
| 85 | + |
| 86 | +    // Symmetric alloc 1024 floats |
| 87 | +    %buf = func.call @mgpuSymmetricAlloc(%c4096_i64, %nullptr) : (i64, !llvm.ptr) -> !llvm.ptr |
| 88 | + |
| 89 | +    // Allocate host buffer of 1024 floats and fill with (rank + 1).0 |
| 90 | +    %hostbuf = func.call @malloc(%c4096_i64) : (i64) -> !llvm.ptr |
| 91 | +    %rank_plus1_i32 = arith.addi %rank, %c1_i32 : i32 |
| 92 | +    %rank_plus1_f32 = arith.sitofp %rank_plus1_i32 : i32 to f32 |
| 93 | +    scf.for %i = %c0 to %c1024 step %c1 { |
| 94 | +      %i_i64 = arith.index_cast %i : index to i64 |
| 95 | +      %addr = llvm.getelementptr %hostbuf[%i_i64] : (!llvm.ptr, i64) -> !llvm.ptr, f32 |
| 96 | +      llvm.store %rank_plus1_f32, %addr : f32, !llvm.ptr |
| 97 | +    } |
| 98 | + |
| 99 | +    // mgpuMemcpy(buf, hostbuf, 4096, nullptr) // H2D |
| 100 | +    func.call @mgpuMemcpy(%buf, %hostbuf, %c4096_i64, %nullptr) : (!llvm.ptr, !llvm.ptr, i64, !llvm.ptr) -> () |
| 101 | + |
| 102 | +    // Barrier so all ranks have written before any reads |
| 103 | +    func.call @mgpuBarrier() : () -> () |
| 104 | + |
| 105 | +    // If world_size > 1, read from peer = (rank + 1) % world. The region |
| 106 | +    // yields the number of mismatched elements (0 when the read is skipped) |
| 107 | +    // so the final verdict below can depend on it. |
| 108 | +    %is_multi = arith.cmpi sgt, %world, %c1_i32 : i32 |
| 109 | +    %nfail_total = scf.if %is_multi -> (i32) { |
| 110 | +      %peer = arith.remsi %rank_plus1_i32, %world : i32 |
| 111 | + |
| 112 | +      // bases = mgpuGetHeapBases() |
| 113 | +      %bases = func.call @mgpuGetHeapBases() : () -> !llvm.ptr |
| 114 | + |
| 115 | +      // peer_base = bases[peer] |
| 116 | +      %peer_i64 = arith.extsi %peer : i32 to i64 |
| 117 | +      %peer_base_addr = llvm.getelementptr %bases[%peer_i64] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.ptr |
| 118 | +      %peer_base = llvm.load %peer_base_addr : !llvm.ptr -> !llvm.ptr |
| 119 | + |
| 120 | +      // local_base = bases[rank] |
| 121 | +      %rank_i64 = arith.extsi %rank : i32 to i64 |
| 122 | +      %local_base_addr = llvm.getelementptr %bases[%rank_i64] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.ptr |
| 123 | +      %local_base = llvm.load %local_base_addr : !llvm.ptr -> !llvm.ptr |
| 124 | + |
| 125 | +      // local_offset = (uintptr_t)buf - (uintptr_t)local_base |
| 126 | +      %buf_int = llvm.ptrtoint %buf : !llvm.ptr to i64 |
| 127 | +      %local_base_int = llvm.ptrtoint %local_base : !llvm.ptr to i64 |
| 128 | +      %offset = arith.subi %buf_int, %local_base_int : i64 |
| 129 | + |
| 130 | +      // peer_buf = (char*)peer_base + offset |
| 131 | +      %peer_buf = llvm.getelementptr %peer_base[%offset] : (!llvm.ptr, i64) -> !llvm.ptr, i8 |
| 132 | + |
| 133 | +      // Allocate a local D2D-target buffer via mgpuMemAlloc(N*sizeof(f32)) |
| 134 | +      %local_copy = func.call @mgpuMemAlloc(%c4096_i64, %nullptr, %false) : (i64, !llvm.ptr, i1) -> !llvm.ptr |
| 135 | + |
| 136 | +      // mgpuMemcpy(local_copy, peer_buf, 4096, nullptr) // D2D |
| 137 | +      func.call @mgpuMemcpy(%local_copy, %peer_buf, %c4096_i64, %nullptr) : (!llvm.ptr, !llvm.ptr, i64, !llvm.ptr) -> () |
| 138 | + |
| 139 | +      // Allocate host readback and copy D2H |
| 140 | +      %host_rb = func.call @malloc(%c4096_i64) : (i64) -> !llvm.ptr |
| 141 | +      func.call @mgpuMemcpy(%host_rb, %local_copy, %c4096_i64, %nullptr) : (!llvm.ptr, !llvm.ptr, i64, !llvm.ptr) -> () |
| 142 | + |
| 143 | +      // Verify: every element == (peer + 1).0 |
| 144 | +      %peer_plus1_i32 = arith.addi %peer, %c1_i32 : i32 |
| 145 | +      %expected = arith.sitofp %peer_plus1_i32 : i32 to f32 |
| 146 | +      %expected_f64 = arith.extf %expected : f32 to f64 |
| 147 | + |
| 148 | +      %nfail = scf.for %i = %c0 to %c1024 step %c1 |
| 149 | +          iter_args(%nfail_acc = %c0_i32) -> (i32) { |
| 150 | +        %i_i64 = arith.index_cast %i : index to i64 |
| 151 | +        %addr = llvm.getelementptr %host_rb[%i_i64] : (!llvm.ptr, i64) -> !llvm.ptr, f32 |
| 152 | +        %v = llvm.load %addr : !llvm.ptr -> f32 |
| 153 | +        // `une` is also true for NaN, so a NaN element counts as a mismatch. |
| 154 | +        %ne = arith.cmpf une, %v, %expected : f32 |
| 155 | + |
| 156 | +        // Report only the first %max_reports mismatches to keep the log |
| 157 | +        // readable; every mismatch is still counted below. |
| 158 | +        %under_cap = arith.cmpi slt, %nfail_acc, %max_reports : i32 |
| 159 | +        %report = arith.andi %ne, %under_cap : i1 |
| 160 | +        scf.if %report { |
| 161 | +          %fmt_fail = llvm.mlir.addressof @msg_fail : !llvm.ptr |
| 162 | +          %v64 = arith.extf %v : f32 to f64 |
| 163 | +          llvm.call @printf(%fmt_fail, %rank, %i_i64, %v64, %expected_f64) vararg(!llvm.func<i32 (ptr, ...)>) : (!llvm.ptr, i32, i64, f64, f64) -> i32 |
| 164 | +        } |
| 165 | + |
| 166 | +        // nfail += (v != expected) ? 1 : 0 |
| 167 | +        %ne_i32 = arith.extui %ne : i1 to i32 |
| 168 | +        %new_nfail = arith.addi %nfail_acc, %ne_i32 : i32 |
| 169 | +        scf.yield %new_nfail : i32 |
| 170 | +      } |
| 171 | + |
| 172 | +      // If no failures, print PASS |
| 173 | +      %ok = arith.cmpi eq, %nfail, %c0_i32 : i32 |
| 174 | +      scf.if %ok { |
| 175 | +        %fmt_pass = llvm.mlir.addressof @msg_pass : !llvm.ptr |
| 176 | +        llvm.call @printf(%fmt_pass, %rank, %peer, %expected_f64) vararg(!llvm.func<i32 (ptr, ...)>) : (!llvm.ptr, i32, i32, f64) -> i32 |
| 177 | +      } |
| 178 | + |
| 179 | +      // Cleanup |
| 180 | +      func.call @free(%host_rb) : (!llvm.ptr) -> () |
| 181 | +      func.call @mgpuMemFree(%local_copy, %nullptr) : (!llvm.ptr, !llvm.ptr) -> () |
| 182 | +      scf.yield %nfail : i32 |
| 183 | +    } else { |
| 184 | +      %fmt_only1 = llvm.mlir.addressof @msg_only1 : !llvm.ptr |
| 185 | +      llvm.call @printf(%fmt_only1, %rank) vararg(!llvm.func<i32 (ptr, ...)>) : (!llvm.ptr, i32) -> i32 |
| 186 | +      scf.yield %c0_i32 : i32 |
| 187 | +    } |
| 188 | + |
| 189 | +    // Second barrier: runs on every rank before any symmetric buffer is |
| 190 | +    // freed below, after the cross-rank reads above. |
| 191 | +    func.call @mgpuBarrier() : () -> () |
| 192 | + |
| 193 | +    // Cleanup |
| 194 | +    func.call @free(%hostbuf) : (!llvm.ptr) -> () |
| 195 | +    func.call @mgpuSymmetricFree(%buf, %nullptr) : (!llvm.ptr, !llvm.ptr) -> () |
| 196 | +    func.call @mgpuSymmetricHeapDestroy() : () -> () |
| 197 | + |
| 198 | +    // Final verdict: "ALL PASSED" only when nothing mismatched. |
| 199 | +    %all_ok = arith.cmpi eq, %nfail_total, %c0_i32 : i32 |
| 200 | +    scf.if %all_ok { |
| 201 | +      %fmt_done = llvm.mlir.addressof @msg_done : !llvm.ptr |
| 202 | +      llvm.call @printf(%fmt_done, %rank) vararg(!llvm.func<i32 (ptr, ...)>) : (!llvm.ptr, i32) -> i32 |
| 203 | +    } else { |
| 204 | +      %fmt_failed = llvm.mlir.addressof @msg_failed : !llvm.ptr |
| 205 | +      llvm.call @printf(%fmt_failed, %rank, %nfail_total) vararg(!llvm.func<i32 (ptr, ...)>) : (!llvm.ptr, i32, i32) -> i32 |
| 206 | +    } |
| 207 | + |
| 208 | +    return |
| 209 | +  } |
| 210 | +} |
0 commit comments